340 lines
14 KiB
C
340 lines
14 KiB
C
/****************************************************************
|
|
* *
|
|
* Copyright 2001, 2009 Fidelity Information Services, Inc *
|
|
* *
|
|
* This source code contains the intellectual property *
|
|
* of its copyright holder(s), and is made available *
|
|
* under a license. If you do not know the terms of *
|
|
* the license, please stop and do not read further. *
|
|
* *
|
|
****************************************************************/
|
|
|
|
#include "mdef.h"
|
|
|
|
#include "gtm_string.h" /* for memset */
|
|
|
|
#include "copy.h"
|
|
#include "patcode.h"
|
|
|
|
#ifdef UNICODE_SUPPORTED
|
|
#include "gtm_utf8.h"
|
|
#endif
|
|
|
|
/* see corresponding GBLDEFs in gbldefs.c for comments on the caching mechanism */
|
|
GBLREF int4 curalt_depth; /* depth of alternation nesting */
|
|
GBLREF int4 do_patalt_calls[PTE_MAX_CURALT_DEPTH]; /* number of calls to do_patalt() */
|
|
GBLREF int4 do_patalt_hits[PTE_MAX_CURALT_DEPTH]; /* number of pte_csh hits in do_patalt() */
|
|
GBLREF int4 do_patalt_maxed_out[PTE_MAX_CURALT_DEPTH]; /* no. of pte_csh misses after maxing on allocation size */
|
|
GBLREF pte_csh *pte_csh_array[PTE_MAX_CURALT_DEPTH]; /* pte_csh array (per curalt_depth) */
|
|
GBLREF int4 pte_csh_cur_size[PTE_MAX_CURALT_DEPTH]; /* current pte_csh size (per curalt_depth) */
|
|
GBLREF int4 pte_csh_alloc_size[PTE_MAX_CURALT_DEPTH]; /* current allocated pte_csh size (per curalt_depth) */
|
|
GBLREF int4 pte_csh_entries_per_len[PTE_MAX_CURALT_DEPTH]; /* current number of entries per len */
|
|
GBLREF int4 pte_csh_tail_count[PTE_MAX_CURALT_DEPTH]; /* count of non 1-1 corresponding pte_csh_array members */
|
|
GBLREF pte_csh *cur_pte_csh_array; /* copy of pte_csh_array corresponding to curalt_depth */
|
|
GBLREF int4 cur_pte_csh_size; /* copy of pte_csh_cur_size corresponding to curalt_depth */
|
|
GBLREF int4 cur_pte_csh_entries_per_len; /* copy of pte_csh_entries_per_len corresponding to curalt_depth */
|
|
GBLREF int4 cur_pte_csh_tail_count; /* copy of pte_csh_tail_count corresponding to curalt_depth */
|
|
GBLREF boolean_t gtm_utf8_mode;
|
|
|
|
/* Example compiled pattern for an alternation pattern
 *	Pattern = P0_P1
 *	----------------
 *		P0 = 1.3(.N,2"-",.2A)
 *		P1 = 1" "
 *
 *	Compiled Pattern
 *	-----------------
 *	0x00000000		<-- fixed (1 if fixed length, 0 if not fixed length)
 *	0x00000027		<-- length of pattern stream (inclusive of itself)
 *
 *	0x02000000	P0	<-- pattern_mask[0] => alternation
 *	0x00000001	P0	<-- alt_rep_min[0]
 *	0x00000003	P0	<-- alt_rep_max[0]
 *	0x00000009	P0	<-- length of alternation pattern's choice[0] pattern (exclusive of itself)
 *	0x00000000	P0	<-- fixed
 *	0x00000002	P0	<-- length of pattern stream (inclusive of itself)
 *	0x40000001	P0	<-- pattern_mask[0] => DFABIT | PATM_N
 *	0x00000001	P0	<-- count
 *	0x00000000	P0	<-- tot_min
 *	0x00007fff	P0	<-- tot_max
 *	0x00000000	P0	<-- min[0]
 *	0x00007fff	P0	<-- max[0]
 *	0x00000001	P0	<-- size[0]
 *	0x0000000a	P0	<-- length of alternation pattern's choice[1] pattern (exclusive of itself)
 *	0x00000001	P0	<-- fixed
 *	0x00000004	P0	<-- length of pattern stream (inclusive of itself)
 *	0x00000082	P0	<-- pattern_mask[0] = PATM_STR | PATM_P
 *	0x00000001	P0	<-- length of PATM_STR (exclusive of itself)
 *	0x0000002d	P0	<-- PATM_STR[0] = '-'
 *	0x00000001	P0	<-- count
 *	0x00000002	P0	<-- tot_min
 *	0x00000002	P0	<-- tot_max
 *	0x00000002	P0	<-- min[0]	// Note for fixed length, max[] array is absent //
 *	0x00000001	P0	<-- size[0]
 *	0x00000000	P0	<-- End of alternation pattern's choices ('\0')
 *
 *	0x00000082	P1	<-- pattern_mask[1] => PATM_STR | PATM_P (' ')
 *	0x00000001	P1	<-- length of PATM_STR (exclusive of itself)
 *	0x00000020	P1	<-- PATM_STR[0] = ' '
 *
 *	0x00000002		<-- count
 *	0x00000001		<-- total_min
 *	0x00007fff		<-- total_max
 *	0x00000000		<-- min[0]	<-- Begin of min[2] array
 *	0x00000001		<-- min[1]
 *	0x00007fff		<-- max[0]	<-- Begin of max[2] array
 *	0x00000001		<-- max[1]
 *	0x00000001		<-- size[0]	<-- Begin of size[2] array
 *	0x00000001		<-- size[1]
 */
|
|
|
|
/* returns index in cur_pte_csh_array that holds the desired <patstr, strptr, charlen, repcnt> tuple..
|
|
* return PTE_NOT_FOUND otherwise.
|
|
*/
|
|
static int pte_csh_present(char *patptr, char *strptr, int4 charlen, int repcnt)
|
|
{
|
|
int4 index;
|
|
pte_csh *tmp_pte, *pte_top;
|
|
|
|
assert(PTE_MAX_CURALT_DEPTH > curalt_depth);
|
|
index = ((PTE_STRLEN_CUTOFF > charlen) ? charlen : PTE_STRLEN_CUTOFF) * cur_pte_csh_entries_per_len;
|
|
assert(cur_pte_csh_size > index);
|
|
tmp_pte = cur_pte_csh_array + index;
|
|
pte_top = tmp_pte + ((PTE_STRLEN_CUTOFF > charlen) ? cur_pte_csh_entries_per_len : cur_pte_csh_tail_count);
|
|
assert(pte_top <= (cur_pte_csh_array + cur_pte_csh_size));
|
|
for (; tmp_pte < pte_top; tmp_pte++)
|
|
{
|
|
if ((tmp_pte->strptr != strptr) || (tmp_pte->patptr != patptr)
|
|
|| (tmp_pte->charlen != charlen) || (tmp_pte->repcnt != repcnt))
|
|
{
|
|
if (NULL != tmp_pte->strptr)
|
|
continue;
|
|
else
|
|
break; /* the first NULL value means all further entries for this "charlen" are NULL */
|
|
}
|
|
tmp_pte->count++;
|
|
return (int)tmp_pte->match;
|
|
}
|
|
return (int)PTE_NOT_FOUND;
|
|
}
|
|
|
|
/* Records the outcome "match" for the <patptr, strptr, charlen, repcnt> tuple in cur_pte_csh_array.
 *
 * The entry goes into the bucket selected by "charlen" (lengths at or beyond PTE_STRLEN_CUTOFF share the tail
 * bucket). The first unused slot (NULL patptr) is taken if there is one. Otherwise the slot with the lowest
 * usage count (the earliest such slot in scan order) is overwritten and every count in the bucket is reset.
 * The caller must not insert a tuple that is already cached (asserted below).
 */
static void pte_csh_insert(char *patptr, char *strptr, int4 charlen, int repcnt, boolean_t match)
{
	int4		bucket_start;
	pte_csh		*bucket, *bucket_end, *slot, *victim;

	assert(PTE_MAX_CURALT_DEPTH > curalt_depth);
	assert(PTE_NOT_FOUND == pte_csh_present(patptr, strptr, charlen, repcnt));
	bucket_start = ((PTE_STRLEN_CUTOFF > charlen) ? charlen : PTE_STRLEN_CUTOFF) * cur_pte_csh_entries_per_len;
	assert(cur_pte_csh_size > bucket_start);
	bucket = cur_pte_csh_array + bucket_start;
	bucket_end = bucket + ((PTE_STRLEN_CUTOFF > charlen) ? cur_pte_csh_entries_per_len : cur_pte_csh_tail_count);
	assert(bucket_end <= (cur_pte_csh_array + cur_pte_csh_size));
	/* Scan for an unused slot, tracking the least used entry in case the bucket turns out to be full */
	victim = bucket;
	for (slot = bucket; slot < bucket_end; slot++)
	{
		if (NULL == slot->patptr)
		{	/* unused slot : take it and stop scanning */
			victim = slot;
			break;
		}
		if (slot->count < victim->count)
			victim = slot;
	}
	if (slot >= bucket_end)
	{	/* Scan ran off the end, i.e. no unused slot was found. Reset all counts in the bucket so that
		 * formerly busy but currently inactive entries lose their accumulated history and can be reused.
		 */
		for (slot = bucket; slot < bucket_end; slot++)
			slot->count = 1;
	}
	victim->patptr = patptr;
	victim->strptr = strptr;
	victim->charlen = charlen;
	victim->repcnt = repcnt;
	victim->match = match;
	victim->count = 0;	/* the new entry starts with a count of 0 (after a reset, 1 less than the others) */
}
|
|
|
|
/* Tries to match the input string against an alternation pattern, handling one repetition of the alternation
 * per invocation and recursing for the remainder of the string.
 *
 *	firstalt	- points at the length word of the first choice in the compiled alternation (see the
 *			  example compiled pattern earlier in this file); the list of choices ends with a 0 length.
 *	strptr		- start of the part of the input string that still needs to be matched.
 *	strtop		- end of the input string (used as the bound when stepping over UTF-8 characters).
 *	repmin, repmax	- minimum and maximum repetition count of the alternation.
 *	totchar		- number of characters that still need to be matched.
 *	repcnt		- repetition number of this invocation (the recursive call passes repcnt + 1).
 *	min_incr,
 *	max_incr	- used only by the backtracking optimization below; presumably the minimum/maximum number
 *			  of characters a single repetition can match (confirm against the caller).
 *
 * Returns the match result (TRUE/FALSE). If the pattern evaluation cache for the current alternation depth
 * already holds a result for <firstalt, strptr, totchar, repcnt>, that cached value is returned directly;
 * otherwise the computed result is added to the cache before returning.
 */
int do_patalt(uint4 *firstalt, unsigned char *strptr, unsigned char *strtop, int4 repmin, int4 repmax, int totchar, int repcnt,
		int4 min_incr, int4 max_incr)
{
	boolean_t	fixed;
	int4		alt_tot_min, alt_tot_max, new_pte_csh_size, tmp_do_patalt_calls;
	uint4		*cur_alt, tempuint;
	uint4		*patptr;
	int		match, alt_size, charlen, bytelen, pat_found;
	mval		alt_pat, alt_str;
	pte_csh		*tmp_pte;
	unsigned char	*strtmp, *strnext;

	if (PTE_MAX_CURALT_DEPTH > curalt_depth)
	{	/* try to find it in the current pattern evaluation cache (cur_pte_csh_array) itself */
		tmp_do_patalt_calls = ++do_patalt_calls[curalt_depth];
		pat_found = pte_csh_present((char *)firstalt, (char *)strptr, totchar, repcnt);
		if (PTE_NOT_FOUND != pat_found)
		{
			do_patalt_hits[curalt_depth]++;
			return pat_found;
		} else if ((tmp_do_patalt_calls > cur_pte_csh_size)
				&& ((tmp_do_patalt_calls - do_patalt_hits[curalt_depth]) > (tmp_do_patalt_calls / PTE_CSH_MISS_FACTOR)))
		{	/* lots of cache miss happening. try to increase pt_csh_array size */
			do_patalt_hits[curalt_depth] = do_patalt_calls[curalt_depth] = 1;	/* restart the statistics */
			new_pte_csh_size = cur_pte_csh_size;
			if (cur_pte_csh_size < pte_csh_alloc_size[curalt_depth])
			{	/* the existing allocation is bigger than what is currently in use : just use more of it */
				new_pte_csh_size = (cur_pte_csh_size << 1);
				/* the memset below clears "new_pte_csh_size" entries, so that is what must fit */
				assert(new_pte_csh_size <= pte_csh_alloc_size[curalt_depth]);
			} else if (PTE_MAX_ENTRIES > pte_csh_alloc_size[curalt_depth])
			{	/* allocate an array twice as big; old contents are discarded, not copied over */
				new_pte_csh_size = (cur_pte_csh_size << 1);
				tmp_pte = malloc(SIZEOF(pte_csh) * new_pte_csh_size);
				free(cur_pte_csh_array);
				pte_csh_alloc_size[curalt_depth] = new_pte_csh_size;
				pte_csh_array[curalt_depth] = tmp_pte;
				cur_pte_csh_array = pte_csh_array[curalt_depth];
			} else
				do_patalt_maxed_out[curalt_depth]++;
			if (new_pte_csh_size != cur_pte_csh_size)
			{	/* the cache grew : start with an empty cache and double all per-depth size parameters */
				memset(pte_csh_array[curalt_depth], 0, SIZEOF(pte_csh) * new_pte_csh_size);
				pte_csh_cur_size[curalt_depth] *= 2;
				pte_csh_entries_per_len[curalt_depth] *= 2;
				pte_csh_tail_count[curalt_depth] *= 2;
				UPDATE_CUR_PTE_CSH_MINUS_ARRAY(cur_pte_csh_size,
					cur_pte_csh_entries_per_len, cur_pte_csh_tail_count);
			}
		}
	}
	alt_pat.mvtype = MV_STR;
	alt_str.mvtype = MV_STR;
	alt_str.str.addr = (char *)strptr;
	patptr = firstalt;
	GET_LONG(alt_size, patptr);	/* length of the first choice; a length of 0 ends the list of choices */
	patptr++;
	/* Try each choice in turn until one (together with the rest of the string) matches.
	 * Inside the loop "patptr" points at the first word ("fixed") of the current choice's compiled pattern.
	 */
	for (match = FALSE; !match && alt_size; patptr++)
	{
		cur_alt = patptr;
		cur_alt++;			/* step over "fixed" */
		GET_ULONG(tempuint, cur_alt);	/* length of the choice's pattern stream */
		cur_alt++;
		cur_alt += tempuint;		/* now positioned at the choice's tot_min */
		GET_LONG(alt_tot_min, cur_alt);
		cur_alt++;			/* now positioned at the choice's tot_max */
		if (alt_tot_min <= totchar)
		{
			GET_LONG(tempuint, cur_alt);
			GET_LONG(fixed, patptr);
			/* Note that some patterns whose minimum and maximum length are the same need not have
			 * "fixed" field 1. This is because alternations which have choices that all evaluate
			 * to the same length (e.g. 5(2l,2e,"ab")) are currently not recognizable by do_patfixed
			 * and hence go through do_pattern.
			 */
			assert(!fixed || (alt_tot_min == tempuint));
			alt_tot_max = (tempuint < totchar) ? tempuint : totchar;
			alt_pat.str.addr = (char *)patptr;
			alt_pat.str.len = alt_size * SIZEOF(uint4);
			/* Note that the below zero min length avoiding code is actually an optimization.
			 * This is because if we start from length 0, we will end up matching the input string and in case
			 * the alternation pattern's max count is huge (e.g. PAT_MAX) we will end up recursing
			 * in do_patalt() as many times each time matching a length of 0, without realizing we are
			 * not progressing anywhere in the match by matching a huge number of empty strings.
			 * This will effectively cause a combinatorial explosion to occur in case there are at least 2 choices
			 * in the alternation pattern (which usually will be the case) since the choices that need to be
			 * examined are 2 ** PAT_MAX.
			 * Instead, if we start from length 1, every level of recursion we decrease the size of the problem
			 * by matching a non-zero length of the input string and hence we can't progress much in the
			 * recursion levels before starting to backtrack, thereby avoiding the explosion.
			 * Note that we do have to consider zero length in case we haven't yet exhausted our minimum count of
			 * the alternation pattern and we have a null input string remaining to be matched.
			 * Hence the if check below.
			 */
			if (totchar && (0 == alt_tot_min))
				alt_tot_min = 1;	/* avoid zero min length when non-zero string still needs to be matched */
			if (!gtm_utf8_mode)
			{	/* each character is 1 byte so charlen and bytelen is same */
				charlen = alt_tot_min;
				bytelen = alt_tot_min;
			}
			UNICODE_ONLY(
			else
			{	/* skip alt_tot_min characters */
				strtmp = strptr;
				for (charlen = 0; charlen < alt_tot_min; charlen++)
				{
					assert(strtmp < strtop);
					strtmp = UTF8_MBNEXT(strtmp, strtop);
				}
				bytelen = (int)(strtmp - strptr);
			}
			)
			UNICODE_ONLY(
				if (gtm_utf8_mode)
					alt_str.mvtype |= MV_UTF_LEN; /* avoid recomputing "char_len" in do_pattern/do_patfixed */
			)
			/* Try every prefix length (in characters) that this choice could match, shortest first */
			for ( ; !match && (charlen <= alt_tot_max); charlen++)
			{
				alt_str.str.len = bytelen;
				UNICODE_ONLY(
					if (gtm_utf8_mode)
					{
						assert(utf8_len(&alt_str.str) == charlen);
						alt_str.str.char_len = charlen;	/* set "char_len" */
					}
				)
				/* a zero length prefix is considered a match without evaluating the choice */
				match = charlen ? (fixed ? do_patfixed(&alt_str, &alt_pat)
								: do_pattern(&alt_str, &alt_pat))
						: TRUE;
				/* max_incr and min_incr aid us in an earlier backtracking optimization.
				 * for example, let us consider "abcdefghijklmnopqrstuvwxyz"?.13(1l,1e,1n,1u,1p,2l)
				 * say the first do_patalt() call matches a substring (the beginning of the input string) "a"
				 *	with the first alternation choice 1l
				 * say the recursive second do_patalt() call then matches a substring of the now beginning
				 *	input string "b" with the first alternation choice 1l again
				 * the recursively called third do_patalt() now can rest assured that the remaining string
				 *	can't be matched by the alternation. This is because it has only 11 chances left
				 *	(note the maximum is .13) and each time the maximum length it can match is 2 (the
				 *	maximum length of all the alternation choices which is 2l) which leaves it with a
				 *	maximum of 22 characters while there are still 24 characters left in the input-string.
				 * this optimization can cause a backtracking to occur at the 3rd level of call to do_patalt()
				 *	instead of going through the call trace 13 times and then determining at the leaf level.
				 * since at each level, the choices examined are 6, we are saving nearly (6 to the power of 11)
				 *	choice examinations (11 for the levels that we avoid with the optimization)
				 */
				if (match && ((charlen < totchar) || (repcnt < repmin)))
					match &= ((repcnt < repmax)
							&& ((totchar - charlen) <= (repmax - repcnt) * max_incr)
							&& ((totchar - charlen) >= (repmin - repcnt) * min_incr))
						? do_patalt(firstalt, &strptr[bytelen], strtop, repmin, repmax,
							totchar - charlen, repcnt + 1, min_incr, max_incr)
						: FALSE;
				if (!match)
				{	/* update "bytelen" to correspond to "charlen + 1" */
					if (!gtm_utf8_mode)
						bytelen++;
					UNICODE_ONLY(
					else
					{
						assert((strtmp < strtop) || (charlen == alt_tot_max));
						if (strtmp < strtop)
						{
							strnext = UTF8_MBNEXT(strtmp, strtop);
							assert(strnext > strtmp);
							bytelen += (int)(strnext - strtmp);
							strtmp = strnext;
						}
					}
					)
				}
			}
		}
		patptr += alt_size;		/* move to the length word of the next choice ... */
		GET_LONG(alt_size, patptr);	/* ... which the loop increment then steps over */
	}
	if (PTE_MAX_CURALT_DEPTH > curalt_depth)
		pte_csh_insert((char *)firstalt, (char *)strptr, totchar, repcnt, match);	/* remember the result */
	return match;
}
|
|
|