fis-gtm/sr_port/do_pattern.c

470 lines
14 KiB
C

/****************************************************************
* *
* Copyright 2001, 2009 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "patcode.h"
#include "copy.h"
#include "min_max.h"
#ifdef UNICODE_SUPPORTED
#include "gtm_icu_api.h" /* needed by *TYPEMASK* macros defined in gtm_utf8.h */
#include "gtm_utf8.h"
#endif
GBLREF uint4 pat_allmaskbits;
GBLREF uint4 *pattern_typemask;
GBLREF char codelist[];
GBLREF int4 curalt_depth; /* depth of alternation nesting */
GBLREF int4 do_patalt_calls[PTE_MAX_CURALT_DEPTH]; /* number of calls to do_patalt() */
GBLREF int4 do_patalt_hits[PTE_MAX_CURALT_DEPTH]; /* number of pte_csh hits in do_patalt() */
GBLREF pte_csh *pte_csh_array[PTE_MAX_CURALT_DEPTH]; /* pte_csh array (per curalt_depth) */
GBLREF int4 pte_csh_cur_size[PTE_MAX_CURALT_DEPTH]; /* current pte_csh size (per curalt_depth) */
GBLREF int4 pte_csh_alloc_size[PTE_MAX_CURALT_DEPTH]; /* current allocated pte_csh size (per curalt_depth) */
GBLREF int4 pte_csh_entries_per_len[PTE_MAX_CURALT_DEPTH]; /* current number of entries per len */
GBLREF int4 pte_csh_tail_count[PTE_MAX_CURALT_DEPTH]; /* count of non 1-1 corresponding pte_csh_array members */
GBLREF pte_csh *cur_pte_csh_array; /* copy of pte_csh_array corresponding to curalt_depth */
GBLREF int4 cur_pte_csh_size; /* copy of pte_csh_cur_size corresponding to curalt_depth */
GBLREF int4 cur_pte_csh_entries_per_len; /* copy of pte_csh_entries_per_len corresponding to curalt_depth */
GBLREF int4 cur_pte_csh_tail_count; /* copy of pte_csh_tail_count corresponding to curalt_depth */
GBLREF boolean_t gtm_utf8_mode;
/* This procedure executes at "run-time". After a pattern in a MUMPS program has been compiled (by patstr and
* its helper-procedures), this procedure can be called to evaluate "variable-length" patterns.
* Variable-length patterns are of the kind 3.5N2.A.5N i.e. for at least one pattern atom,
* the lower-bound is different from the upper-bound.
* For patterns with a fixed length, procedure do_patfixed() will be called to do the evaluation.
* Variable length input patterns will be scanned at runtime to see if they have at least one fixed length pattern atom.
* If yes, routine do_patsplit() will be invoked to determine the fixed length sub pattern atom that is closest to the
* median of the pattern atoms. The input pattern would then be split into three, left, fixed and right.
* The fixed pattern can be matched by a linear scan in the input string. Once that is done, the left and right
* pattern atom positions are pivoted relative to the input string and do_pattern() is invoked on each of them recursively.
* If no fixed length pattern atoms can be found, then do_pattern() calculates all possible permutations (it has certain
* optimizations to prune the combinatorial search tree) and tries to see for each permutation if a match occurs.
*/
int do_pattern(mval *str, mval *pat)
{
int4 count, total_min, total_max;
int4 alt_rep_min, alt_rep_max, min_incr, max_incr;
int4 bytelen, charlen, length, pbytelen, strbytelen;
boolean_t success, attempt, pvalid, strvalid;
int atom, unit, idx, index, hasfixed;
uint4 z_diff, *rpt, *rtop, rept;
uint4 repeat[MAX_PATTERN_ATOMS];
uint4 *patidx[MAX_PATTERN_ATOMS];
unsigned char *stridx[MAX_PATTERN_ATOMS];
unsigned char *strptr, *strtop, *strnext, *pstr, *ptop, *pnext;
uint4 code, tempuint;
uint4 *dfa_ptr, dfa_val;
uint4 *patptr;
uint4 mbit, flags;
int4 *min, *max, *size;
int4 mintmp, maxtmp, sizetmp;
int alt, bit;
char buf[CHAR_CLASSES];
boolean_t pte_csh_init;
boolean_t match;
UNICODE_ONLY(
wint_t utf8_codepoint;
)
error_def(ERR_PATNOTFOUND);
/* set up information */
MV_FORCE_STR(str);
patptr = (uint4 *) pat->str.addr;
GET_ULONG(tempuint, patptr);
if (tempuint)
{ /* tempuint non-zero implies fixed length pattern string. this in turn implies we are not called from op_pattern.s
* but instead called from op_fnzsearch(), gvzwr_fini(), gvzwr_var(), lvzwr_fini(), lvzwr_var() etc.
* in this case, call do_patfixed() as the code below and code in do_patsplit() assumes we are dealing with a
* variable length pattern string. changing all the callers to call do_patfixed() directly instead of this extra
* redirection was considered, but not felt worth it since the call to do_pattern() is not easily macroizable
* due to the expression-like usage in those places.
*/
return do_patfixed(str, pat);
}
patptr++;
patidx[0] = patptr + 1;
stridx[0] = (unsigned char *)str->str.addr;
strtop = stridx[0] + str->str.len;
GET_ULONG(tempuint, patptr);
patptr += tempuint;
GET_LONG(count, patptr);
patptr++;
GET_LONG(total_min, patptr);
patptr++;
GET_LONG(total_max, patptr);
patptr++;
/* "length" actually denotes character length; Get it from the appropriate field in the mstr */
if (!gtm_utf8_mode)
length = str->str.len;
UNICODE_ONLY(
else
{
MV_FORCE_LEN(str); /* to set str.char_len if not already done; also issues BADCHAR error if appropriate */
length = str->str.char_len;
}
)
if (length < total_min || length > total_max)
return FALSE;
min = (int4 *)patptr;
patptr += count;
max = (int4 *)patptr;
patptr += count;
if (MIN_SPLIT_N_MATCH_COUNT <= count)
{
hasfixed = FALSE;
for (index = 0; index < count; index++)
{
GET_LONG(maxtmp, max + index);
GET_LONG(mintmp, min + index);
if (maxtmp == mintmp)
{
hasfixed = TRUE;
break;
}
}
if (hasfixed && (DO_PATSPLIT_FAIL != (match = do_patsplit(str, pat))))
return match;
}
size = (int4 *)patptr;
memcpy(repeat, min, count * SIZEOF(*min));
rtop = &repeat[0] + count;
count--;
attempt = FALSE;
idx = 0;
pte_csh_init = FALSE;
/* proceed to check string */
for (;;)
{
if (total_min == length)
{ /* attempt a match */
attempt = TRUE;
strptr = stridx[idx];
patptr = patidx[idx];
rpt = &repeat[idx];
for (; rpt < rtop; rpt++)
{
GET_ULONG(code, patptr);
patptr++;
rept = *rpt;
if (code & PATM_ALT)
{ /* pattern alternation */
GET_LONG(alt_rep_min, patptr);
patptr++;
GET_LONG(alt_rep_max, patptr);
patptr++;
GET_LONG(maxtmp, max + idx);
GET_LONG(mintmp, min + idx);
assert(alt_rep_min || !mintmp);
assert(alt_rep_max || !maxtmp);
min_incr = mintmp ? mintmp / alt_rep_min : 0;
max_incr = maxtmp ? maxtmp / alt_rep_max : 0;
if (rept)
{
if (FALSE == pte_csh_init)
{
PTE_CSH_INCR_CURALT_DEPTH(curalt_depth);
pte_csh_init = TRUE;
}
if (do_patalt(patptr, strptr, strtop, alt_rep_min, alt_rep_max, rept, 1,
min_incr, max_incr))
{
if (!gtm_utf8_mode)
strptr += rept;
UNICODE_ONLY(
else
{
for (unit = 0; unit < rept; unit++)
{
assert(strptr < strtop); /* below macro relies on this */
strptr = UTF8_MBNEXT(strptr, strtop);
}
}
)
} else
goto CALC;
}
/* make sure that patptr points to the next patcode after the alternation */
GET_LONG(alt, patptr);
patptr++;
while(alt)
{
patptr += alt;
GET_LONG(alt, patptr);
patptr++;
}
} else if (!(code & PATM_STRLIT))
{ /* meta character pat atom */
if (!(code & pat_allmaskbits))
{ /* current table has no characters with this pattern code */
bytelen = 0;
for (bit = 0; bit < PAT_MAX_BITS; bit++)
{
mbit = (1 << bit);
if ((mbit & code & PATM_LONGFLAGS) && !(mbit & pat_allmaskbits))
buf[bytelen++] = codelist[patmaskseq(mbit)];
}
rts_error(VARLSTCNT(4) ERR_PATNOTFOUND, 2, bytelen, buf);
}
if (!gtm_utf8_mode)
{
for (unit = 0; unit < rept; unit++)
{
if (!(code & pattern_typemask[*strptr++]))
goto CALC;
}
}
UNICODE_ONLY(
else
{
for (unit = 0; unit < rept; unit++)
{
assert(strptr < strtop); /* PATTERN_TYPEMASK macro relies on this */
if (!(code & PATTERN_TYPEMASK(strptr, strtop, strnext, utf8_codepoint)))
goto CALC;
strptr = strnext;
}
}
)
} else if (code == PATM_DFA)
{ /* Discrete Finite Automaton pat atom */
GET_LONG(bytelen, patptr);
patptr++;
dfa_ptr = patptr;
for (unit = 0; unit < rept; )
{
GET_ULONG(dfa_val, dfa_ptr);
if (!(dfa_val & PATM_STRLIT))
{
if (!gtm_utf8_mode)
{
success = (dfa_val & pattern_typemask[*strptr]);
strnext = strptr + 1;
}
UNICODE_ONLY(
else
{
success = (dfa_val &
PATTERN_TYPEMASK(strptr, strtop, strnext, utf8_codepoint));
}
)
} else
{
dfa_ptr++;
GET_ULONG(dfa_val, dfa_ptr);
/* Only ASCII characters are currently allowed for DFA STRLITs.
* Assert that below.
*/
assert(IS_ASCII(dfa_val));
if (!gtm_utf8_mode)
{
success = (dfa_val == *strptr);
strnext = strptr + 1;
}
UNICODE_ONLY(
else
{
UTF8_VALID(strptr, strtop, strbytelen);
success = ((1 == strbytelen) && (dfa_val == *strptr));
strnext = strptr + strbytelen;
}
)
}
dfa_ptr++;
if (success)
{
GET_ULONG(dfa_val, dfa_ptr);
dfa_ptr = patptr + dfa_val;
strptr = strnext;
unit++;
GET_ULONG(dfa_val, dfa_ptr);
if (dfa_val == PATM_ACS)
break;
} else
{
dfa_ptr++;
GET_ULONG(dfa_val, dfa_ptr);
if ((dfa_val & PATM_DFA) == PATM_DFA)
break;
}
}
if (unit < rept)
goto CALC;
else
{
GET_ULONG(dfa_val, dfa_ptr);
while (dfa_val < PATM_DFA)
{
if (dfa_val & PATM_STRLIT)
dfa_ptr += 3;
else
dfa_ptr += 2;
GET_ULONG(dfa_val, dfa_ptr);
}
if (dfa_val != PATM_ACS)
goto CALC;
}
patptr += bytelen;
} else
{ /* STRLIT pat atom */
assert(3 == PAT_STRLIT_PADDING);
GET_LONG(bytelen, patptr); /* get bytelen */
patptr++;
GET_LONG(charlen, patptr); /* get charlen */
patptr++;
GET_ULONG(flags, patptr); /* get flags */
patptr++;
if (bytelen == 1)
{
if (!gtm_utf8_mode)
{
for (unit = 0; unit < rept; unit++)
{
if (*(unsigned char *)patptr != *strptr++)
goto CALC;
}
}
UNICODE_ONLY(
else
{
for (unit = 0; unit < rept; unit++)
{
if ((1 != (UTF8_VALID(strptr, strtop, strbytelen), strbytelen))
|| (*(unsigned char *)patptr != *strptr++))
goto CALC;
}
}
)
patptr++;
} else if (bytelen > 0)
{
if (!gtm_utf8_mode)
{
ptop = (unsigned char *)patptr + bytelen;
for (unit = 0; unit < rept; unit++)
{
pstr = (unsigned char *)patptr;
while (pstr < ptop)
{
if (*pstr++ != *strptr++)
goto CALC;
}
}
}
UNICODE_ONLY(
else
{
pstr = (unsigned char *)patptr;
ptop = pstr + bytelen;
for (unit = 0; unit < rept; unit++)
{
pstr = (unsigned char *)patptr;
for ( ; pstr < ptop; )
{
pvalid = UTF8_VALID(pstr, ptop, pbytelen);
/* sets pbytelen */
assert(pvalid);
strvalid = UTF8_VALID(strptr, strtop, strbytelen);
/* sets strbytelen */
if (pbytelen != strbytelen)
goto CALC;
DEBUG_ONLY(strnext = strptr + pbytelen);
pnext = pstr + pbytelen;
do
{
if (*pstr++ != *strptr++)
goto CALC;
} while (pstr < pnext);
assert(strptr == strnext);
}
}
}
)
patptr += DIVIDE_ROUND_UP(bytelen, SIZEOF(*patptr));
}
}
idx++;
stridx[idx] = strptr;
patidx[idx] = patptr;
}
if (pte_csh_init)
{ /* surrounded by braces since the following is a multi-line macro */
PTE_CSH_DECR_CURALT_DEPTH(curalt_depth);
}
return TRUE;
} else
{ /* calculate permutations */
attempt = FALSE;
GET_LONG(maxtmp, max + count);
GET_LONG(mintmp, min + count);
GET_LONG(sizetmp, size + count);
if (repeat[count] < maxtmp)
{
atom = unit = length - total_min;
z_diff = maxtmp - mintmp;
if (sizetmp > 1)
{
unit /= sizetmp;
atom = unit * sizetmp;
z_diff *= sizetmp;
}
if (atom > 0)
{
total_min += MIN(atom, z_diff);
repeat[count] = MIN(repeat[count] + unit, maxtmp);
if (total_min == length)
continue;
}
}
}
CALC: unit = count;
GET_LONG(sizetmp, size + unit);
for ( ; ; )
{
GET_LONG(mintmp, min + unit);
total_min -= (repeat[unit] - mintmp) * sizetmp;
repeat[unit] = mintmp;
unit--;
if (unit < 0)
{
if (pte_csh_init)
{ /* surrounded by braces since the following is a multi-line macro */
PTE_CSH_DECR_CURALT_DEPTH(curalt_depth);
}
return FALSE;
}
GET_LONG(maxtmp, max + unit);
GET_LONG(sizetmp, size + unit);
if (repeat[unit] < maxtmp)
{
total_min += sizetmp;
repeat[unit]++;
if (total_min <= length)
{
if (unit <= idx)
{
idx = unit;
break;
}
if (!attempt)
break;
}
}
}
}
}