fis-gtm/sr_port/do_patsplit.c

363 lines
12 KiB
C
Raw Permalink Normal View History

/****************************************************************
* *
* Copyright 2001, 2009 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "patcode.h"
#include "copy.h"
#include "min_max.h"
#include "gtm_string.h"
#ifdef UNICODE_SUPPORTED
#include "gtm_utf8.h"
#endif
GBLREF boolean_t gtm_utf8_mode;
/* This routine tries to split the pattern-string P into substrings LFR where F is a fixed length pattern-atom of P
* and L and R are the pattern substrings in P on the left and right side of F.
* For pattern-strings that have more than one fixed-length pattern-atom, the one closest to the median is selected.
* Once F is determined, do_patfixed() is invoked to determine all possible matches of F with the input string.
* do_patsplit() has additional optimizations to try match the fixed length pattern in a restricted subset of the
* input string taking into account the min and max of the left and the right pattern.
* For each such match, an attempt is made to match L and R with the left and right side of the input string.
* This is done using do_patfixed() or do_pattern() appropriately.
* If any such attempt succeeds, we return TRUE.
* If no attempt succeeds, we return FALSE.
* If the pattern-string P doesn't contain any fixed-length pattern-atom, we return DO_PATSPLIT_FAIL
*/
int do_patsplit(mval *str, mval *pat)
{
int4 count, total_min, total_max;
int4 min[MAX_PATTERN_ATOMS], max[MAX_PATTERN_ATOMS], size[MAX_PATTERN_ATOMS];
int4 bytelen, charlen, charstoskip, fixedcharlen, leftcharlen, rightcharlen, deltalen, numchars;
int4 strbytelen, strcharlen;
int4 alt_rep_min, alt_rep_max;
int4 alt;
uint4 tempuint;
uint4 code, flags;
uint4 *patptr, *patptr_start, *patptr_end, *fixed_patptr, *right_patptr, *tmp_patptr;
ptstr left_ptstr, right_ptstr, fixed_ptstr;
mval left_pat, right_pat, fixed_pat, left_str, right_str, fixed_str;
int4 index, fixed_index; /* index of our current fixed-length pattern-atom */
boolean_t right; /* 0 indicates we are processing left side, 1 indicates right side */
boolean_t fixed[2]; /* fixed[0] is for the left, fixed[1] is for the right */
int4 tot_min[2], tot_max[2], cnt[2]; /* index 0 is for left, index 1 is for right */
int4 offset;
unsigned char *strptr, *strtop, *rightptr, *rightnext, *fixedptr, *fixednext, *maxfixedptr;
boolean_t match; /* match status of input pattern with input string */
gtm_uint64_t bound;
MV_FORCE_STR(str);
patptr = (uint4 *)pat->str.addr;
DEBUG_ONLY(
GET_ULONG(tempuint, patptr);
assert(!tempuint);
)
patptr++;
patptr_start = patptr + 1;
GET_ULONG(tempuint, patptr);
patptr += tempuint;
patptr_end = patptr;
GET_LONG(count, patptr);
patptr++;
GET_LONG(total_min, patptr);
patptr++;
GET_LONG(total_max, patptr);
patptr++;
UNICODE_ONLY(
if (gtm_utf8_mode)
{
MV_FORCE_LEN(str); /* to set str.char_len if not already done */
assert(str->str.char_len >= total_min && str->str.char_len <= total_max);
}
)
assert(count <= MAX_PATTERN_ATOMS);
memcpy(&min[0], patptr, SIZEOF(*patptr) * count);
patptr += count;
memcpy(&max[0], patptr, SIZEOF(*patptr) * count);
patptr += count;
memcpy(&size[0], patptr, SIZEOF(*patptr) * count);
patptr = patptr_start;
right = FALSE; /* start with left side */
fixed[right] = TRUE;
tot_min[right] = tot_max[right] = 0;
fixed_patptr = right_patptr = NULL;
fixed_index = -1;
for (index = 0; index < count; index++)
{
GET_ULONG(code, patptr);
tmp_patptr = patptr;
patptr++;
if (code & PATM_ALT)
{ /* skip to the next pattern-atom */
GET_LONG(alt_rep_min, patptr);
patptr++;
GET_LONG(alt_rep_max, patptr);
patptr++;
GET_LONG(alt, patptr);
patptr++;
while(alt)
{
patptr += alt;
GET_LONG(alt, patptr);
patptr++;
}
fixed[right] = FALSE;
/* patptr now points to the next patcode after the alternation */
} else if (code == PATM_DFA)
{ /* Discrete Finite Automaton pat atom */
assert(min[index] != max[index]); /* DFA should never be fixed length */
GET_LONG(bytelen, patptr);
patptr++;
patptr += bytelen;
fixed[right] = FALSE;
} else
{
if (code & PATM_STRLIT)
{ /* STRLIT pat atom */
assert(3 == PAT_STRLIT_PADDING);
GET_LONG(bytelen, patptr);
patptr++;
GET_LONG(charlen, patptr);
patptr++;
GET_ULONG(flags, patptr);
patptr++;
patptr += DIVIDE_ROUND_UP(bytelen, SIZEOF(*patptr));
}
if ((min[index] == max[index]) && (bound = (gtm_uint64_t)min[index] * size[index]))
{ /* fixed_length */
if (ABS(index - (count / 2)) < ABS(fixed_index - (count / 2)))
{ /* non-zero fixed length pattern with a fixed_index closer to the median of the array */
if (right)
{ /* update left's tot_min and tot_max to reflect the new fixed_index */
tot_min[0] += tot_min[right] +
BOUND_MULTIPLY(min[fixed_index], size[fixed_index], bound);
if (tot_min[0] > PAT_MAX_REPEAT)
tot_min[0] = PAT_MAX_REPEAT;
tot_max[0] += tot_max[right] +
BOUND_MULTIPLY(max[fixed_index], size[fixed_index], bound);
if (tot_max[0] > PAT_MAX_REPEAT)
tot_max[0] = PAT_MAX_REPEAT;
fixed[0] &= fixed[right];
}
fixed_index = index;
right = TRUE;
fixed[right] = TRUE;
tot_min[right] = tot_max[right] = 0;
fixed_patptr = tmp_patptr;
right_patptr = patptr;
continue;
}
} else
fixed[right] = FALSE;
}
tot_min[right] += BOUND_MULTIPLY(min[index], size[index], bound);
if (tot_min[right] > PAT_MAX_REPEAT)
tot_min[right] = PAT_MAX_REPEAT;
tot_max[right] += BOUND_MULTIPLY(max[index], size[index], bound);
if (tot_max[right] > PAT_MAX_REPEAT)
tot_max[right] = PAT_MAX_REPEAT;
}
assert(index == count);
if (-1 == fixed_index)
return DO_PATSPLIT_FAIL;
assert(fixed_index < count);
assert((total_min == (tot_min[0] + tot_min[1] + BOUND_MULTIPLY(min[fixed_index], size[fixed_index], bound))) ||
(PAT_MAX_REPEAT == total_min));
assert((total_max == (tot_max[0] + tot_max[1] + BOUND_MULTIPLY(max[fixed_index], size[fixed_index], bound))) ||
(PAT_MAX_REPEAT == total_max));
cnt[0] = fixed_index;
if (cnt[0])
{ /* left section has at least one pattern atom. create its compilation string */
patptr = left_ptstr.buff;
*patptr++ = fixed[0];
*patptr++ = (uint4)(fixed_patptr - patptr_start + 1);
memcpy(patptr, patptr_start, (char *)fixed_patptr - (char *)patptr_start);
patptr += fixed_patptr - patptr_start;
*patptr++ = cnt[0];
*patptr++ = tot_min[0];
*patptr++ = tot_max[0];
for (index = 0; index < cnt[0]; index++)
*patptr++ = min[index];
if (!fixed[0])
{
for (index = 0; index < cnt[0]; index++)
*patptr++ = max[index];
}
for (index = 0; index < cnt[0]; index++)
*patptr++ = size[index];
left_pat.mvtype = MV_STR;
left_pat.str.len = INTCAST((char *)patptr - (char *)&left_ptstr.buff[0]);
left_pat.str.addr = (char *)&left_ptstr.buff[0];
}
/* create fixed length pattern atom's compilation string */
patptr = fixed_ptstr.buff;
*patptr++ = TRUE; /* fixed length pattern */
*patptr++ = (uint4)(right_patptr - fixed_patptr + 1);
memcpy(patptr, fixed_patptr, (char *)right_patptr - (char *)fixed_patptr);
patptr += right_patptr - fixed_patptr;
*patptr++ = 1; /* count */
fixedcharlen = min[fixed_index] * size[fixed_index]; /* tot_min and tot_max */
*patptr++ = fixedcharlen;
*patptr++ = fixedcharlen;
*patptr++ = min[fixed_index]; /* min[0] */
*patptr++ = size[fixed_index]; /* size[0] */
fixed_pat.mvtype = MV_STR;
fixed_pat.str.len = INTCAST((char *)patptr - (char *)&fixed_ptstr.buff[0]);
fixed_pat.str.addr = (char *)&fixed_ptstr.buff[0];
cnt[1] = count - fixed_index - 1;
if (cnt[1])
{ /* right section has at least one pattern atom. create its compilation string */
patptr = right_ptstr.buff;
*patptr++ = fixed[1];
*patptr++ = (uint4)(patptr_end - right_patptr + 1);
memcpy(patptr, right_patptr, (char *)patptr_end - (char *)right_patptr);
patptr += patptr_end - right_patptr;
*patptr++ = cnt[1];
*patptr++ = tot_min[1];
*patptr++ = tot_max[1];
for (index = fixed_index + 1; index < count; index++)
*patptr++ = min[index];
if (!fixed[1])
{
for (index = fixed_index + 1; index < count; index++)
*patptr++ = max[index];
}
for (index = fixed_index + 1; index < count; index++)
*patptr++ = size[index];
right_pat.mvtype = MV_STR;
right_pat.str.len = INTCAST((char *)patptr - (char *)&right_ptstr.buff[0]);
right_pat.str.addr = (char *)&right_ptstr.buff[0];
}
strbytelen = str->str.len;
strptr = (unsigned char *)str->str.addr;
strtop = strptr + strbytelen;
if (!gtm_utf8_mode)
strcharlen = str->str.len;
UNICODE_ONLY(
else
strcharlen = str->str.char_len;
)
/* Determine "maxfixedptr" */
if (strcharlen > (tot_min[1] + tot_max[0] + fixedcharlen))
charstoskip = tot_max[0];
else
charstoskip = strcharlen - tot_min[1] - fixedcharlen;
if (!gtm_utf8_mode)
strptr += charstoskip;
UNICODE_ONLY(
else
{
for ( ; 0 < charstoskip; charstoskip--)
{
assert(strptr < strtop); /* below macro relies on this */
strptr = UTF8_MBNEXT(strptr, strtop);
}
}
)
maxfixedptr = strptr;
/* Determine "fixedptr" */
strptr = (unsigned char *)str->str.addr;
if (strcharlen > (tot_min[0] + tot_max[1] + fixedcharlen))
charstoskip = strcharlen - tot_max[1] - fixedcharlen;
else
charstoskip = tot_min[0];
leftcharlen = charstoskip;
if (!gtm_utf8_mode)
strptr += charstoskip;
UNICODE_ONLY(
else
{
for ( ; 0 < charstoskip; charstoskip--)
{
assert(strptr < strtop); /* below macro relies on this */
strptr = UTF8_MBNEXT(strptr, strtop);
}
}
)
fixedptr = strptr;
/* Set "left_str" */
left_str.mvtype = MV_STR;
left_str.str.addr = str->str.addr;
/* Set "right_str" */
right_str.mvtype = MV_STR;
/* Set "fixed_str" */
fixed_str.mvtype = MV_STR;
if (!gtm_utf8_mode)
{
fixed_str.str.len = fixedcharlen;
rightptr = fixedptr + fixedcharlen;
}
UNICODE_ONLY(
else
{
left_str.mvtype |= MV_UTF_LEN; /* avoid recomputing "char_len" in do_patfixed below */
right_str.mvtype |= MV_UTF_LEN; /* avoid recomputing "char_len" in do_patfixed below */
fixed_str.mvtype |= MV_UTF_LEN; /* avoid recomputing "char_len" in do_patfixed below */
fixed_str.str.char_len = fixedcharlen;
/* skip fixedcharlen characters */
assert(fixedptr == strptr);
for (numchars = 0; numchars < fixedcharlen; numchars++)
{
assert(strptr < strtop);
strptr = UTF8_MBNEXT(strptr, strtop);
}
rightptr = strptr;
}
)
deltalen = 0;
/* Try to match the fixed pattern string and for each match, try matching the left and right input strings */
for (match = FALSE; !match && (fixedptr <= maxfixedptr); fixedptr = fixednext, rightptr = rightnext, leftcharlen++)
{
fixed_str.str.addr = (char *)fixedptr;
fixed_str.str.len = INTCAST(rightptr - fixedptr);
if (!gtm_utf8_mode)
{
fixednext = fixedptr + 1;
rightnext = rightptr + 1;
}
UNICODE_ONLY(
else
{
assert(fixedptr < strtop);
fixednext = UTF8_MBNEXT(fixedptr, strtop);
assert((rightptr < strtop) || (fixedptr == maxfixedptr) && (fixednext > maxfixedptr));
if (rightptr < strtop)
rightnext = UTF8_MBNEXT(rightptr, strtop);
}
)
if (!do_patfixed(&fixed_str, &fixed_pat))
continue;
assert(cnt[0] || cnt[1]); /* fixed_pat takes only one pattern atom and non-zero rest are in cnt[0] and cnt[1] */
if (cnt[0])
{
left_str.str.len = INTCAST(fixedptr - (unsigned char *)left_str.str.addr);
UNICODE_ONLY(left_str.str.char_len = leftcharlen;)
match = fixed[0] ? do_patfixed(&left_str, &left_pat) : do_pattern(&left_str, &left_pat);
if (!match)
continue;
}
if (cnt[1])
{
right_str.str.addr = (char *)rightptr;
UNICODE_ONLY(right_str.str.char_len = strcharlen - leftcharlen - fixedcharlen;)
right_str.str.len = INTCAST(strtop - rightptr);
match = (fixed[1] ? do_patfixed(&right_str, &right_pat) : do_pattern(&right_str, &right_pat));
}
}
return match;
}