fis-gtm/sr_port/patstr.c

873 lines
26 KiB
C

/****************************************************************
* *
* Copyright 2001, 2010 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "compiler.h"
#include "patcode.h"
#include "toktyp.h"
#include "copy.h"
#include "min_max.h"
#ifdef UNICODE_SUPPORTED
#include "gtm_utf8.h" /* needed for UTF8_MBNEXT macro */
#endif
GBLREF uint4 mapbit[];
GBLREF boolean_t gtm_utf8_mode;
LITREF char ctypetab[NUM_CHARS];
LITREF uint4 typemask[PATENTS];
/* One node of the singly linked list that patstr() builds while compiling an "alternation" such as (1A,2N):
 * each node holds the compiled form of one alternative. The first node lives on patstr()'s stack (init_alt);
 * nodes for further alternatives are malloc'ed and released by patstr().
 */
typedef struct
{
unsigned char *next;	/* next node in the list (cast to "alternation *" by users); NULL marks the end of the list */
ptstr altpat;		/* compiled pattern of this alternative, filled in by a recursive call to patstr() */
} alternation;
/* This procedure is part of the MUMPS compiler. The function of this procedure is to parse a pattern specification
* and compile it into a data structure that will be used by the run-time engine to actually attempt to match the pattern.
*
* The data structure that is built in 'obj' to describe the pattern is stored in units of [unsigned int4]s.
* However, it is treated as an mstr with a character count by all modules except those specific to pattern matching.
*
* The contents of the table look like:
*
* [0] flag: non-zero if the pattern is "fixed" (3n1"abc"5a is "fixed", 3.5n is not)
* [1] counter: amount of space used for pattern masks and string-buffers (in units of cell-size)
* [...] space for additional buffers. If the pattern contains strings, the text for those strings is stored in this space.
* If the pattern contains "alternations", the specifications of these are stored in this area.
* Each "alternation" specifier is a (recursive) instance of this table.
* [n] counter: number of pattern specifications
* [n+1] total number of characters in specified patterns
* [n+2] maximum number of characters in specified patterns
* [n+3] min[0]: first element of array containing the minimum numbers for the repeat-counts
* ...
* [n+3+count-1]
*
* *** only if pattern is not "fixed" ***
* [m] = [n+3+count]
* [m] max[0]: first element of array containing the maximum numbers for the repeat-counts
* ...
* [m+count-1]
*
* *** always ***
* [p] = [n+3+count+count] or [n+3+count]
* [p] size[0]: first element of array containing sizes
* ...
* [p+count-1]
*
*======================================================================
*
* Pattern specifications are compiled by this procedure (patstr).
* Run-time evaluation occurs through one of three possible evaluators:
* 1. For "fixed" patterns: do_patfixed ("Fixed" patterns look like 3a2n1p)
* 2. Other patterns go through do_pattern (These are patterns like 1.3a2.5n1.6p)
* 3. A special case is patterns that have more than 1 pattern code with an indeterminate upper bound.
* Those patterns are processed using the DFA algorithm.
*/
/* Compile the pattern text described by "instr" into the table format stored in "obj".
 *
 * Parameters:
 *	instr	- in: addr/len of the pattern source text.
 *		  out: instr->addr is advanced as the text is scanned; on an error return it is set at (or one byte past)
 *		  the offending position so that the caller can compute the erroring source column.
 *	obj	- out: receives the compiled pattern (obj->buff) and its length in uint4 units (obj->len).
 *	relay	- NULL for a top-level call. Non-NULL when this is a recursive call that compiles one alternative of an
 *		  alternation; when a ',' or ')' delimiter ends the alternative, *relay is set to point at that delimiter.
 *
 * Returns 0 on success, otherwise one of ERR_PATCODE, ERR_PATUPPERLIM, ERR_PATCLASS, ERR_PATLIT or ERR_PATMAXLEN.
 *
 * Alternation nodes that are malloc'ed while an alternation is being parsed are released on every exit path:
 * at the bottom of each pass through the main loop on success, and at the "free_alternations" label on error.
 */
int patstr(mstr *instr, ptstr *obj, unsigned char **relay)
{
	pat_strlit strlit;
	boolean_t infinite, split_atom, done, dfa, fixed_len, prev_fixed_len;
	int4 lower_bound, upper_bound, alloclen;
	gtm_uint64_t bound;
	unsigned char curchar, symbol, *inchar, *in_top, *buffptr;
	uint4 pattern_mask, last_leaf_mask, y_max, mbit;
	uint4 *patmaskptr;
	int atom_map, count, total_min, total_max;
	int min[MAX_PATTERN_ATOMS], max[MAX_PATTERN_ATOMS], size[MAX_PATTERN_ATOMS];
	struct leaf leaves, *lv_ptr;
	struct e_table expand, *exp_ptr;
	int exp_temp[CHAR_CLASSES];
	int leaf_num, curr_leaf_num, min_dfa, curr_min_dfa, sym_num;
	int seqcnt, charpos, leafcnt, cursize;
	int4 bitpos;
	alternation init_alt;
	alternation *cur_alt;
	mstr alttail;
	int4 status;
	int4 altactive = 0;
	int4 altend;
	char *saveinstr;
	int chidx;
	int bit;
	int seq;
	int altmin, altmax;
	int saw_delimiter = 0;
	int4 altlen, bytelen;
	int4 allmask;
	boolean_t last_infinite;
	unsigned char *let_go;
	uint4 *fstchar, *outchar, *lastpatptr;
	int any_alt = FALSE;
	int altcount, altsimplify;
	int low_in, high_in, size_in, jump;
	boolean_t topseen = FALSE;/* If TRUE it means we found inchar to be == in_top and so did NOT scan the NEXT
				   * byte in inchar (to be stored in curchar). Therefore from this point onwards,
				   * "curchar" should never be used in this function. This is also asserted below.
				   */
	error_def(ERR_PATCODE);
	error_def(ERR_PATUPPERLIM);
	error_def(ERR_PATCLASS);
	error_def(ERR_PATLIT);
	error_def(ERR_PATMAXLEN);

	if (0 == instr->len) /* empty pattern string. Can't do much */
	{
		instr->addr++; /* Return 1 byte more for compile_pattern to properly compute erroring M source column */
		return ERR_PATCODE;
	}
	memset(&leaves, 0, SIZEOF(leaves));
	memset(&expand, 0, SIZEOF(expand));
	init_alt.next = NULL;
	init_alt.altpat.len = 0;
	/* altmin/altmax are passed by value to add_atom() and pat_unwind() even for atoms that are not alternations,
	 * so give them a defined value before the first alternation (if any) sets them.
	 */
	altmin = altmax = 0;
	fstchar = &obj->buff[0];
	saveinstr = (char *) &instr->addr[0];
	for (allmask = 0, chidx = 'A'; chidx <= 'X'; chidx++)
		allmask |= mapbit[chidx - 'A'];
	outchar = &obj->buff[PAT_MASK_BEGIN_OFFSET]; /* Note: offset is actually PAT_MASK_BEGIN_OFFSET * SIZEOF (uint4) bytes */
	last_leaf_mask = *outchar = 0;
	patmaskptr = lastpatptr = outchar;
	infinite = last_infinite = FALSE;
	dfa = split_atom = FALSE;
	fixed_len = TRUE;
	count = total_min = total_max = atom_map = 0;
	lv_ptr = &leaves;
	exp_ptr = &expand;
	inchar = (unsigned char *)instr->addr;
	in_top = (unsigned char *)&inchar[instr->len];
	assert(inchar < in_top);
	curchar = *inchar++;
	altactive = 0;
	for (;;)
	{
		assert(inchar <= in_top);
		altend = 0;
		prev_fixed_len = fixed_len;
		if ((NULL != relay) && !saw_delimiter)
		{	/* Compiling one alternative on behalf of a caller: a ',' or ')' ends it */
			assert(!topseen);
			if ((',' == curchar) || (')' == curchar))
			{
				*relay = (inchar - 1);
				altend = 1;
			}
		}
		saw_delimiter = 0;
		if (!altactive || altend)
		{	/* Parse the repeat count (lower[.upper]) of the next atom, or finish the pattern */
			instr->addr = (char*)inchar;
			if (!topseen && (TK_PERIOD == ctypetab[curchar]))
			{
				lower_bound = 0;
				fixed_len = FALSE;
			} else if (!topseen && (TK_DIGIT == ctypetab[curchar]))
			{
				lower_bound = curchar - '0';
				for ( ; ; )
				{
					if (inchar >= in_top)
					{
						assert(inchar == in_top);
						topseen = TRUE;
						break;
					}
					curchar = *inchar++;
					if (TK_DIGIT != ctypetab[curchar])
						break;
					if (PAT_MAX_REPEAT < (lower_bound = (lower_bound * 10) + (curchar - '0')))
						lower_bound = PAT_MAX_REPEAT;
				}
				infinite = FALSE;
			} else
			{	/* Neither '.' nor a digit (or end of input): the pattern ends here. Flush a pending DFA, if any,
				 * and write out the trailer (count, totals, min[], max[], size[]).
				 */
				if (dfa)
				{
					patmaskptr = outchar;
					cursize = dfa_calc(lv_ptr, leaf_num, exp_ptr, &fstchar, &outchar);
					if (cursize >= 0)
					{
						min[count] = min_dfa;
						max[count] = PAT_MAX_REPEAT;
						size[count] = cursize;
						/* Use the same 64-bit bounded multiply and clamp as the other two places that
						 * finalize a DFA, rather than a plain int multiplication that can overflow.
						 */
						total_min += BOUND_MULTIPLY(min[count], size[count], bound);
						if (total_min > PAT_MAX_REPEAT)
							total_min = PAT_MAX_REPEAT;
						total_max += BOUND_MULTIPLY(max[count], size[count], bound);
						if (total_max > PAT_MAX_REPEAT)
							total_max = PAT_MAX_REPEAT;
						lastpatptr = patmaskptr;
						last_infinite = TRUE;
						count++;
					} else
					{
						outchar = patmaskptr;
						if (!pat_unwind(&count, lv_ptr, leaf_num, &total_min, &total_max,
							&min[0], &max[0], &size[0], altmin, altmax,
							&last_infinite, &fstchar, &outchar, &lastpatptr))
						{
							instr->addr = (char *)inchar;
							assert(FALSE);
							return ERR_PATMAXLEN;
						}
					}
				}
				if (outchar == &obj->buff[PAT_MASK_BEGIN_OFFSET])
				{	/* Nothing was compiled at all */
					instr->addr = (char *)inchar;
					return ERR_PATCODE;
				}
				patmaskptr = &obj->buff[0];
				*patmaskptr++ = fixed_len;
				*patmaskptr = (uint4)(outchar - patmaskptr); /* unit is SIZEOF(uint4) */
				*outchar++ = count;
				*outchar++ = total_min;
				*outchar++ = total_max;
				for (seqcnt = 0; seqcnt < count; seqcnt++)
					*outchar++ = min[seqcnt];
				if (!fixed_len)
					for (seqcnt = 0; seqcnt < count; seqcnt++)
						*outchar++ = max[seqcnt];
				for (seqcnt = 0; seqcnt < count; seqcnt++)
					*outchar++ = size[seqcnt];
				obj->len = (int4)(outchar - &obj->buff[0]);
				assert(!topseen || (inchar == in_top));
				assert(inchar <= in_top);
				instr->addr = (topseen ? (char *)inchar : (char *)inchar - 1);
				return 0;
			}
			if (!topseen && (curchar != '.'))
				upper_bound = lower_bound;
			else
			{
				fixed_len = FALSE;
				if (inchar >= in_top)
				{
					assert(inchar == in_top);
					instr->addr = (char *)inchar + 1;
					return ERR_PATCLASS;
				}
				assert(!topseen);
				if (ctypetab[curchar = *inchar++] != TK_DIGIT)
				{
					if (lower_bound > 0)
					{	/* A pattern atom like 5.A will be split into two atoms:
						 * (i) the first will be a fixed length one (5A);
						 * (ii) the second one will be a completely indefinite one (.A).
						 * This split allows the run-time engine to separate out the
						 * fixed part from the indefinite part.
						 */
						split_atom = TRUE;
					} else
					{
						infinite = TRUE;
						upper_bound = PAT_MAX_REPEAT;
					}
				} else
				{
					infinite = FALSE;
					instr->addr = (char *)inchar;
					upper_bound = curchar - '0';
					for ( ; ; )
					{
						if (inchar >= in_top)
						{
							assert(inchar == in_top);
							topseen = TRUE;
							break;
						}
						curchar = *inchar++;
						if (TK_DIGIT != ctypetab[curchar])
							break;
						if (PAT_MAX_REPEAT < (upper_bound = (upper_bound * 10) + (curchar - '0')))
							upper_bound = PAT_MAX_REPEAT;
					}
					if (upper_bound < lower_bound)
					{
						instr->addr = (char *)inchar;
						return ERR_PATUPPERLIM;
					}
				}
			}
			instr->addr = (char *)inchar;
			if (count >= MAX_PATTERN_ATOMS)
			{
				assert(FALSE);
				return ERR_PATMAXLEN;
			}
		}
		if (!altend)
		{	/* Parse what the repeat count applies to: string literal, alternation piece, or pattern codes */
			if (!topseen && ('\"' == curchar))
			{
				pattern_mask = PATM_STRLIT;
				strlit.bytelen= 0;
				strlit.charlen= 0;
				strlit.flags = 0;
				alloclen = (SIZEOF(strlit.buff) / SIZEOF(strlit.buff[0]));
				buffptr = &strlit.buff[0];
				for (;;)
				{
					if (inchar >= in_top)
					{
						assert(inchar == in_top);
						instr->addr = (char *)inchar + 1;
						return ERR_PATLIT;
					}
					curchar = *inchar;
					if ('\"' == curchar)
					{	/* Either the closing quote or the first of a doubled (escaped) quote */
						if (++inchar >= in_top)
						{
							assert(inchar == in_top);
							topseen = TRUE;
							break;
						}
						if ((curchar = *inchar) != '\"')
						{
							inchar++;
							break;
						}
					}
					if (!gtm_utf8_mode)
						bytelen = 1;
					UNICODE_ONLY(
					else
					{
						if (!UTF8_VALID(inchar, in_top, bytelen))
						{
							instr->addr = (char *)inchar;
							return ERR_PATLIT;
						}
						assert(1 <= bytelen);
					}
					)
					if (!IS_ASCII(curchar))
						strlit.flags |= PATM_STRLIT_NONASCII;
					strlit.bytelen += bytelen;
					if (strlit.bytelen >= alloclen)
					{
						instr->addr = (char *)inchar;
						assert(FALSE);
						return ERR_PATMAXLEN;
					}
					do
					{
						assert(inchar < in_top);
						*buffptr++ = *inchar++;
					} while (0 < --bytelen);
					strlit.charlen++;
				}
				assert((strlit.flags & PATM_STRLIT_NONASCII) || strlit.bytelen == strlit.charlen);
				assert((strlit.flags & PATM_STRLIT_NONASCII) || !(strlit.flags & PATM_STRLIT_BADCHAR));
				if (!strlit.charlen)
				{	/* Empty string literal: treat as a zero-repeat atom that does not affect "fixed"-ness */
					lower_bound = upper_bound = 0;
					infinite = FALSE;
					fixed_len = prev_fixed_len;
					split_atom = FALSE;
				}
			} else if (!topseen && ('(' == curchar))
			{	/* start of 'alternation' */
				if (dfa)
				{
					if (!pat_unwind(&count, lv_ptr, leaf_num, &total_min, &total_max,
						&min[0], &max[0], &size[0], altmin, altmax,
						&last_infinite, &fstchar, &outchar, &lastpatptr))
					{
						instr->addr = (char *)inchar;
						assert(FALSE);
						return ERR_PATMAXLEN;
					}
					dfa = FALSE;
				}
				if (inchar >= in_top)
				{
					assert(inchar == in_top);
					instr->addr = (char *)inchar + 1;
					return ERR_PATCODE;
				}
				pattern_mask = PATM_ALT;
				cur_alt = &init_alt;
				alttail.addr = (char *)inchar;
				alttail.len = instr->len - (int4)((char *)inchar - saveinstr);
				/* The first alternative is compiled into the stack-resident init_alt, so nothing to free on error */
				status = patstr(&alttail, &cur_alt->altpat, &inchar);
				if (status)
				{
					instr->addr = (char *)alttail.addr;
					return status;
				}
				saw_delimiter = 1;
				altlen = cur_alt->altpat.buff[PAT_LEN_OFFSET];
				altmin = cur_alt->altpat.buff[PAT_TOT_MIN_OFFSET(altlen)];
				altmax = cur_alt->altpat.buff[PAT_TOT_MAX_OFFSET(altlen)];
				altcount = 1;
				any_alt = TRUE;
				assert(inchar < in_top);
				curchar = *inchar++;
				altactive = 1;
				continue;
			} else if (!topseen && (',' == curchar))
			{	/* separator between alternate possibilities */
				/* The malloc that is requested here will be freed below when the alternation is
				 * added to the output data structure (just below the call to add_atom), or at the
				 * free_alternations label if an error is detected before that point.
				 */
				if (!altactive)
				{
					instr->addr = (char *)inchar;
					return ERR_PATCLASS;
				}
				if (inchar >= in_top)
				{	/* Nodes from earlier ',' separators may already be allocated: release them */
					assert(inchar == in_top);
					instr->addr = (char *)inchar + 1;
					status = ERR_PATCODE;
					goto free_alternations;
				}
				cur_alt->next = (unsigned char *)malloc(SIZEOF(alternation));
				cur_alt = (alternation *)cur_alt->next;
				cur_alt->next = NULL;
				alttail.addr = (char *)inchar;
				alttail.len = instr->len - (int4)((char *)inchar - saveinstr);
				status = patstr(&alttail, &cur_alt->altpat, &inchar);
				if (status)
				{	/* The node just allocated (and any earlier ones) would otherwise be leaked */
					instr->addr = (char *)alttail.addr;
					goto free_alternations;
				}
				saw_delimiter = 1;
				altlen = cur_alt->altpat.buff[PAT_LEN_OFFSET];
				if (cur_alt->altpat.buff[PAT_TOT_MIN_OFFSET(altlen)] < altmin)
					altmin = cur_alt->altpat.buff[PAT_TOT_MIN_OFFSET(altlen)];
				if (cur_alt->altpat.buff[PAT_TOT_MAX_OFFSET(altlen)] > altmax)
					altmax = cur_alt->altpat.buff[PAT_TOT_MAX_OFFSET(altlen)];
				altcount++;
				assert(inchar < in_top);
				curchar = *inchar++;
				continue;
			} else if (!topseen && (')' == curchar))
			{	/* end of 'alternation' */
				if (!altactive)
				{
					instr->addr = (char *)inchar;
					return ERR_PATCLASS;
				}
				altactive = 0;
				if (inchar < in_top)
					curchar = *inchar++;
				else
				{	/* We cannot do curchar = *inchar++ in this case since we are beyond the input bounds */
					assert(inchar == in_top);
					topseen = TRUE;
					assert(!dfa);
				}
			} else
			{	/* One or more pattern code letters */
				if (topseen)
				{
					instr->addr = (char *)inchar + 1;
					return ERR_PATCLASS;
				}
				pattern_mask = 0;
				do
				{
					chidx = (curchar > 'Z') ? curchar - 'a' : curchar - 'A';
					if ((0 <= chidx) && (chidx <= 'X' - 'A'))
						pattern_mask |= mapbit[chidx];
					else if (('Y' - 'A' == chidx) || ('Z' - 'A' == chidx))
					{	/* YxxxY and ZxxxZ codes not yet implemented */
						instr->addr = (char *)inchar;
						return ERR_PATCLASS;
					} else
					{
						assert(TK_UPPER != ctypetab[curchar] && TK_LOWER != ctypetab[curchar]);
						break;
					}
					if (inchar >= in_top)
					{
						topseen = TRUE;
						assert(inchar == in_top);
						break;
					}
					curchar = *inchar++;
				} while (TRUE);
				if (0 == pattern_mask)
				{	/* Can be reached while an alternation is still open (e.g. an alternative that was not
					 * terminated by ',' or ')'), so release any alternation nodes on the way out.
					 */
					instr->addr = topseen ? (char *)inchar + 1 : (char *)inchar;
					status = ERR_PATCLASS;
					goto free_alternations;
				}
			}
		}
		if (split_atom)
		{	/* First half of a split atom (e.g. the "5A" of "5.A") */
			assert(FALSE == infinite);
			upper_bound = lower_bound;
		}
		done = FALSE;
		while (!done)
		{
			done = TRUE;
			/* DFAs can be used within alternations, but not at the nesting level where the alternations themselves
			 * occur. Also, strings with a length of 0 characters should not be processed within DFAs, since there
			 * are no character cells to hold the mask and flag bits that the DFA code needs. Also strings with
			 * non-ASCII UTF-8 byte sequences are currently not processed through the DFA logic.
			 */
			if (infinite && !any_alt && !dfa
				&& (!(pattern_mask & PATM_STRLIT) || (strlit.charlen && !(strlit.flags & PATM_STRLIT_NONASCII)))
				&& ((outchar - &obj->buff[0]) <= (MAX_PATTERN_LENGTH / 2)))
			{
				dfa = TRUE;
				last_leaf_mask = 0;
				leaf_num = 0;
				sym_num = 0;
				min_dfa = 0;
				atom_map = count;
				memset(expand.num_e, 0, SIZEOF(expand.num_e));
			}
			if (!dfa)
			{
				if (count >= MAX_PATTERN_ATOMS ||
					!add_atom(&count, pattern_mask, &strlit, infinite,
						&min[count], &max[count], &size[count],
						&total_min, &total_max, lower_bound, upper_bound, altmin, altmax,
						&last_infinite, &fstchar, &outchar, &lastpatptr))
				{	/* If this atom is an alternation its nodes are still allocated: release them */
					instr->addr = (char *)inchar;
					assert(FALSE);
					status = ERR_PATMAXLEN;
					goto free_alternations;
				}
				if (pattern_mask & PATM_ALT)
				{	/* If the alternation contains only one alternative (altcount == 1) AND
					 * that alternative contains only one pattern atom, AND that atom is not an
					 * alternation or a DFA, the alternation can be reduced to that atom.
					 * The boundaries of the compressed atom will be the products of the
					 * boundaries of the alternation and those of the atom within the alternation
					 * (lower*lower and upper*upper) E.g. 10.20(.5AN) is the same as .100AN
					 *
					 * Such a simplification can be made if:
					 * => the inner lower limit is 0 or 1
					 * e.g. 2.3(0.2L) = 0.6L
					 * 0 = 0 + 0 + 0
					 * 1 = 0 + 0 + 1
					 * 2 = 0 + 1 + 1
					 * 3 = 1 + 1 + 1
					 * 4 = 1 + 1 + 2
					 * 5 = 1 + 2 + 2
					 * 6 = 2 + 2 + 2
					 * e.g. 2.3(1.2L) = 2.6L
					 * 2 = 1 + 1
					 * 3 = 1 + 1 + 1
					 * 4 = 1 + 1 + 2
					 * 5 = 1 + 2 + 2
					 * 6 = 2 + 2 + 2
					 * Note that lower limit 1 case is the same as the lower limit 0 case
					 * except for not counting "0" as one match.
					 *
					 * => or the outer lower and upper limit are the same (not a range)
					 * e.g. 4(4.5L) = 16.20L
					 * 16 = 4 + 4 + 4 + 4
					 * 17 = 4 + 4 + 4 + 5
					 * 18 = 4 + 4 + 5 + 5
					 * 19 = 4 + 5 + 5 + 5
					 * 20 = 5 + 5 + 5 + 5
					 */
					altsimplify = ((1 == altcount) && cur_alt) ? TRUE : FALSE;
					if (altsimplify && (cur_alt->altpat.buff[PAT_MASK_BEGIN_OFFSET] & (PATM_ALT | PATM_DFA)))
						altsimplify = FALSE;
					if (altsimplify)
					{
						jump = cur_alt->altpat.buff[PAT_LEN_OFFSET];
						if (1 != cur_alt->altpat.buff[PAT_COUNT_OFFSET(jump)])
							altsimplify = FALSE;
					}
					if (altsimplify)
					{
						assert(0 == cur_alt->altpat.buff[0] || 1 == cur_alt->altpat.buff[0]);
						size_in = cur_alt->altpat.buff[
							PAT_SIZE_BEGIN_OFFSET(cur_alt->altpat.buff[0], jump, 1)];
						high_in = cur_alt->altpat.buff[
							PAT_MAX_BEGIN_OFFSET(cur_alt->altpat.buff[0], jump, 1)];
						low_in = cur_alt->altpat.buff[
							PAT_MIN_BEGIN_OFFSET(cur_alt->altpat.buff[0], jump, 1)];
						if (!size_in)
							size_in = 1;
						if ((1 != low_in) && (0 != low_in) && (lower_bound != upper_bound))
							altsimplify = FALSE;
					}
					if (altsimplify)
					{	/* Replace the alternation atom just added by the single inner atom */
						size[count - 1] = size_in;
						min[count - 1] = BOUND_MULTIPLY(low_in, lower_bound, bound);
						lower_bound = min[count - 1];
						if (!cur_alt->altpat.buff[0])
							fixed_len = FALSE;
						if (!fixed_len)
						{
							max[count - 1] = BOUND_MULTIPLY(high_in, upper_bound, bound);
							upper_bound = max[count - 1];
						}
						outchar--;
						for (seq = PAT_MASK_BEGIN_OFFSET; seq <= jump; seq++)
							*outchar++ = cur_alt->altpat.buff[seq];
						pattern_mask = cur_alt->altpat.buff[PAT_MASK_BEGIN_OFFSET];
						if (pattern_mask & PATM_STRLIT)
						{
							assert(3 == PAT_STRLIT_PADDING);
							strlit.bytelen = cur_alt->altpat.buff[PAT_MASK_BEGIN_OFFSET + 1];
							strlit.charlen = cur_alt->altpat.buff[PAT_MASK_BEGIN_OFFSET + 2];
							strlit.flags = cur_alt->altpat.buff[PAT_MASK_BEGIN_OFFSET + 3];
							memcpy(strlit.buff, &cur_alt->altpat.buff[PAT_MASK_BEGIN_OFFSET
								+ PAT_STRLIT_PADDING + 1], strlit.bytelen);
						}
					} else
					{	/* Emit the alternation: bounds, then each alternative as (length, table), then a 0 */
						fixed_len = FALSE;
						*outchar++ = lower_bound;
						*outchar++ = upper_bound;
						for (cur_alt = &init_alt; cur_alt; )
						{
							*outchar++ = cur_alt->altpat.len;
							for (seq = 0; seq < cur_alt->altpat.len; seq++)
								*outchar++ = cur_alt->altpat.buff[seq];
							cur_alt = (alternation *)cur_alt->next;
						}
						*outchar++ = 0;
					}
				}
			} else
			{
				leafcnt = charpos = MAX(lower_bound, 1);
				if ((pattern_mask & PATM_STRLIT) && !(strlit.flags & PATM_STRLIT_NONASCII))
				{
					assert(strlit.charlen == strlit.bytelen);
					charpos *= strlit.bytelen;
					leafcnt = MAX(charpos, leafcnt);
				}
				if ((lower_bound > MAX_DFA_REP)
					|| ((pattern_mask & PATM_STRLIT) && (strlit.flags & PATM_STRLIT_NONASCII))
					|| (!infinite && lower_bound != upper_bound)
					|| ((leaf_num + leafcnt) >= (MAX_SYM - 1))
					|| (charpos > MAX_DFA_STRLEN))
				{	/* This atom cannot join the DFA: close the DFA collected so far and redo the atom without it */
					patmaskptr = outchar;
					cursize = dfa_calc(lv_ptr, leaf_num, exp_ptr, &fstchar, &outchar);
					if (cursize >= 0)
					{
						min[count] = min_dfa;
						max[count] = PAT_MAX_REPEAT;
						size[count] = cursize;
						total_min += BOUND_MULTIPLY(min[count], size[count], bound);
						if (total_min > PAT_MAX_REPEAT)
							total_min = PAT_MAX_REPEAT;
						total_max += BOUND_MULTIPLY(max[count], size[count], bound);
						if (total_max > PAT_MAX_REPEAT)
							total_max = PAT_MAX_REPEAT;
						lastpatptr = patmaskptr;
						last_infinite = TRUE;
						count++;
					} else
					{
						outchar = patmaskptr;
						if (!pat_unwind(&count, lv_ptr, leaf_num, &total_min, &total_max,
							&min[0], &max[0], &size[0], altmin, altmax,
							&last_infinite, &fstchar, &outchar, &lastpatptr))
						{
							instr->addr = (char *)inchar;
							assert(FALSE);
							return ERR_PATMAXLEN;
						}
					}
					dfa = FALSE;
					done = FALSE;
					continue;
				} else
				{
					curr_min_dfa = min_dfa;
					curr_leaf_num = leaf_num;
					if (pattern_mask & PATM_STRLIT)
					{
						memset(&exp_temp[0], 0, SIZEOF(exp_temp));
						min[atom_map] = lower_bound;
						max[atom_map] = upper_bound;
						size[atom_map] = strlit.bytelen;
						atom_map++;
						min_dfa += lower_bound * strlit.bytelen;
						cursize = MAX(lower_bound, 1);
						for (seqcnt = 0; seqcnt < cursize; seqcnt++)
						{
							for (charpos = 0; charpos < strlit.bytelen; charpos++)
							{
								symbol = strlit.buff[charpos];
								/* It is ok to use typemask[] below because we are guaranteed
								 * that "symbol" is a 1-byte valid ASCII character. Assert that.
								 */
								assert(!(strlit.flags & PATM_STRLIT_NONASCII) && IS_ASCII(symbol));
								bitpos = patmaskseq(typemask[symbol]);
								if (expand.num_e[bitpos] + exp_temp[bitpos] == 0)
									exp_temp[bitpos]++;
								for (leafcnt = 1;
									leafcnt < expand.num_e[bitpos] + exp_temp[bitpos] &&
										expand.meta_c[bitpos][leafcnt] != symbol;
									leafcnt++)
									;
								if (leafcnt == expand.num_e[bitpos] + exp_temp[bitpos])
									exp_temp[bitpos]++;
								expand.meta_c[bitpos][leafcnt] = symbol;
								if (!infinite)
								{
									leaves.letter[leaf_num][0] = symbol;
									leaves.letter[leaf_num][1] = -1;
									leaves.nullable[leaf_num++] = FALSE;
								} else
									leaves.letter[leaf_num][charpos] = symbol;
							}
							if (infinite)
							{
								leaves.letter[leaf_num][charpos] = -1;
								leaves.nullable[leaf_num++] = infinite;
							}
						}
						last_leaf_mask = PATM_STRLIT;
						last_infinite = infinite;
						sym_num = 0;
						for (leafcnt = 0; leafcnt < leaf_num; leafcnt++)
						{
							for (charpos = 0; leaves.letter[leafcnt][charpos] >= 0; charpos++)
							{
								if (!(leaves.letter[leafcnt][charpos] & DFABIT))
									sym_num++;
								else
								{
									bitpos = patmaskseq(leaves.letter[leafcnt][charpos]);
									sym_num += expand.num_e[bitpos] + exp_temp[bitpos];
								}
							}
						}
					} else
					{
						if (!(last_leaf_mask & PATM_STRLIT) && infinite && last_infinite)
						{
							y_max = MAX(pattern_mask, last_leaf_mask);
							if ((last_leaf_mask & pattern_mask) &&
								((last_leaf_mask | pattern_mask) == y_max))
							{
								if (last_leaf_mask == y_max)
									continue;	/* "done" is TRUE here, so this leaves the while loop */
								leaf_num--;
								atom_map--;
							}
						}
						min[atom_map] = lower_bound;
						max[atom_map] = upper_bound;
						size[atom_map] = 1;
						atom_map++;
						min_dfa += lower_bound;
						charpos = MAX(lower_bound, 1);
						last_infinite = infinite;
						last_leaf_mask = pattern_mask;
						for (seqcnt = 0; seqcnt < charpos; seqcnt++)
						{
							bitpos = 0;
							leaves.nullable[leaf_num] = infinite;
							/* Check all PAT_MAX_BITS bits if there are flags for internationalization,
							 * otherwise, check only the original PAT_BASIC_CLASSES bits
							 * (C, L, N, P, U, 0, 1) where
							 * 0 = PATM_UTF8_ALPHABET, 1 = PATM_UTF8_NONBASIC
							 */
							if (PATM_E != pattern_mask)
							{
								chidx = (pattern_mask & PATM_I18NFLAGS)
									? PAT_MAX_BITS : PAT_BASIC_CLASSES;
							} else
								chidx = PAT_BASIC_CLASSES;
							for (bit = 0; bit < chidx; bit++)
							{
								mbit = 1 << bit;
								if ((allmask & mbit) && (pattern_mask & mbit))
								{
									seq = patmaskseq((uint4)mbit);
									if (expand.num_e[seq] == 0)
										expand.num_e[seq]++;
									sym_num += expand.num_e[seq];
									assert(MAX_DFA_STRLEN >= bitpos);
									leaves.letter[leaf_num][bitpos++] = DFABIT | mbit;
								}
							}
							assert(MAX_DFA_STRLEN >= bitpos);
							leaves.letter[leaf_num][bitpos] = -1;
							leaf_num++;
						}
					}
				}
				if (sym_num >= MAX_SYM - 1)
				{	/* Adding this atom overflowed the DFA: close it as it was before this atom and redo the atom */
					patmaskptr = outchar;
					cursize = dfa_calc(lv_ptr, curr_leaf_num, exp_ptr, &fstchar, &outchar);
					if (cursize >= 0)
					{
						min[count] = curr_min_dfa;
						max[count] = PAT_MAX_REPEAT;
						size[count] = cursize;
						total_min += BOUND_MULTIPLY(min[count], size[count], bound);
						if (total_min > PAT_MAX_REPEAT)
							total_min = PAT_MAX_REPEAT;
						total_max += BOUND_MULTIPLY(max[count], size[count], bound);
						if (total_max > PAT_MAX_REPEAT)
							total_max = PAT_MAX_REPEAT;
						lastpatptr = patmaskptr;
						last_infinite = TRUE;
						count++;
					} else
					{
						outchar = patmaskptr;
						if (!pat_unwind(&count, lv_ptr, curr_leaf_num, &total_min, &total_max,
							&min[0], &max[0], &size[0], altmin, altmax,
							&last_infinite, &fstchar, &outchar, &lastpatptr))
						{
							instr->addr = (char *)inchar;
							assert(FALSE);
							return ERR_PATMAXLEN;
						}
					}
					dfa = FALSE;
					done = FALSE;
					continue;
				} else
				{
					if (last_leaf_mask & PATM_STRLIT)
						for (seqcnt = 0; seqcnt < CHAR_CLASSES; seqcnt++)
							expand.num_e[seqcnt] += exp_temp[seqcnt];
				}
			}
			if (split_atom)
			{	/* Now process the second, indefinite half of the split atom (the ".A" of "5.A") */
				lower_bound = 0;
				upper_bound = PAT_MAX_REPEAT;
				infinite = TRUE;
				split_atom = FALSE;
				done = FALSE;
			}
		}
		/* Release the malloc'ed alternation nodes (everything after the stack-resident init_alt) */
		for (cur_alt = &init_alt; cur_alt; )
		{
			let_go = (cur_alt != (alternation *)&init_alt) ? (unsigned char *)cur_alt : NULL;
			cur_alt = (alternation *)cur_alt->next;
			if (let_go)
				free(let_go);
		}
		init_alt.next = NULL;
		init_alt.altpat.len = 0;
	}
free_alternations:
	/* Error exit for the paths that can be taken while malloc'ed alternation nodes are chained off init_alt.
	 * "status" holds the error code and instr->addr has already been set by the code that jumped here.
	 */
	for (cur_alt = (alternation *)init_alt.next; NULL != cur_alt; )
	{
		let_go = (unsigned char *)cur_alt;
		cur_alt = (alternation *)cur_alt->next;
		free(let_go);
	}
	return status;
}