fis-gtm/sr_port/patcode.h

339 lines
13 KiB
C

/****************************************************************
* *
* Copyright 2001, 2009 Fidelity Information Services, Inc *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#ifndef PATCODE_H_DEFINED
#define PATCODE_H_DEFINED
/* pattern operator constants */
#define PATM_N (1 << 0)
#define PATM_P (1 << 1)
#define PATM_L (1 << 2)
#define PATM_U (1 << 3)
#define PATM_C (1 << 4)
#define PATM_UTF8_ALPHABET (1 << 5) /* Unicode characters that are alphabets but not cased (i.e. neither L nor U) */
#define PATM_UTF8_NONBASIC (1 << 6) /* Currently not used, but retained in case needed in the future */
#define PAT_BASIC_CLASSES 7 /* the 5 canonic classes (defined by the M standard) plus the two PATM_UTF8_* classes */
#define PATM_STRLIT (1 << 7)
#define PATM_A (PATM_L | PATM_U | PATM_UTF8_ALPHABET)
#define PATM_E 0x1FFFF7F /* E matches **any** pattern code... */
/* Additional user-definable codes for Internationalization */
/* PATM_A already used for Alphabetic characters */
#define PATM_B (1 << 8)
/* PATM_C already used for Control characters */
#define PATM_D (1 << 9)
/* PATM_E already used for Entire set of characters */
#define PATM_F (1 << 10)
#define PATM_G (1 << 11)
#define PATM_H (1 << 12)
#define PATM_I (1 << 13)
#define PATM_J (1 << 14)
#define PATM_K (1 << 15)
/* PATM_L already used for Lower-case characters */
#define PATM_M (1 << 16)
/* PATM_N already used for Numeric characters */
#define PATM_O (1 << 17)
/* PATM_P already used for Punctuation characters */
#define PATM_Q (1 << 18)
#define PATM_R (1 << 19)
#define PATM_S (1 << 20)
#define PATM_T (1 << 21)
/* PATM_U already used for Upper-case characters */
#define PATM_V (1 << 22)
#define PATM_W (1 << 23)
#define PATM_X (1 << 24)
/* PATM_Y reserved for YanythingY extension */
/* PATM_Z reserved for ZanythingZ extension */
#define PATM_ALT (1 << 25)
#define PATM_YZ1 (1 << 26)
#define PATM_YZ2 (1 << 27)
#define PATM_YZ3 (1 << 28)
#define PATM_YZ4 (1 << 29)
#define DFABIT (1 << 30)
#define PAT_MAX_BITS 32
#define PAT_YZMAXNUM 4 /* maximum number of YxxxY and ZxxxZ pattern codes */
#define PAT_YZMAXLEN 8 /* maximum number of characters in name of YxxxY pattern code */
/* PATM_UTF8_ALPHABET and PATM_UTF8_NONBASIC are assigned an external name of 0 and 1 below as we have run out of alphabets */
#define PATM_CODELIST "NPLUCBDFGHIJKMOQRSTVWX01"
#define PATM_DFA 254 /* Deterministic Finite Automaton */
#define PATM_ACS 255
#define PATM_SHORTFLAGS 0x7F /* original 5 flags + the two recently introduced PATM_UTF8_* flags */
#define PATM_LONGFLAGS 0x1FFFF7F /* 24 flags (including the two recently introduced PATM_UTF8_* flags) */
#define PATM_I18NFLAGS 0x1FFFF00 /* flags introduced with Internationalization */
#define MAX_DFA_SPACE 170
#define PAT_MAX_REPEAT MAX_STRLEN
#define MAX_PATTERN_ATOMS 50
#define MAX_PATOBJ_LENGTH 2048
#define MAX_PATTERN_OVERHEAD ((3 * MAX_PATTERN_ATOMS) + 3) * 4/* maximum length (in integers) of compiled pattern code
* excluding count,tot_min,tot_max and min,max,size arrays
* that come at the tail. 3 * MAX_PATTERN_ATOMS is for
* min, max, size arrays and 3 is for count, tot_min, tot_max
* 4 is to convert the size into bytes.
*/
#define MAX_PATTERN_LENGTH (MAX_PATOBJ_LENGTH - MAX_PATTERN_OVERHEAD)
#define PATENTS 256 /* Size of the builtin pattern/typemask table in M mode */
#define PATENTS_UTF8 128 /* Size of the builtin pattern/typemask table in UTF-8 mode */
#define CHAR_CLASSES 24
#define PAT_STRLIT_PADDING 3 /* # of int4s for storing bytelen, charlen, flags in PATM_STRLIT type */
#define MAX_SYM 16
#define FST 0
#define LST 1
#define MAX_DFA_STRLEN 64 /* needs to be > PAT_BASIC_CLASSES */
#define MAX_DFA_REP 10
/* The macro to perform 64-bit multiplication without losing precision due to overflow with 32-bit
* multiplication. With the increase of PAT_MAX_REPEAT from 32K to 1MB, the values of result and
* the expression value can potentially be as large as (1MB * 1MB).
* NOTE: The macro expects result to be declared with type gtm_uint64_t */
#define BOUND_MULTIPLY(x, y, result) (((result = ((gtm_uint64_t)(x) * (y))) >= PAT_MAX_REPEAT) ? PAT_MAX_REPEAT : (int)(result))
/* Compiled Pattern
* -----------------
* 0x00000000 <-- fixed (1 if fixed length, 0 if not fixed length)
* 0x00000027 <-- length of pattern stream (inclusive of itself)
* 0x00000008 <-- begin of pattern mask
* ...
* 0x00000002 <-- count
* 0x00000001 <-- total_min
* 0x00007fff <-- total_max
* 0x00000000 <-- min[0] <-- Begin of min[2] array
* 0x00000001 <-- min[1]
* 0x00007fff <-- max[0] <-- Begin of max[2] array
* 0x00000001 <-- max[1]
* 0x00000001 <-- size[0] <-- Begin of size[2] array
* 0x00000001 <-- size[1]
*/
#define PAT_LEN_OFFSET 1 /* relative to beginning of compiled pattern which starts with "fixed" */
#define PAT_MASK_BEGIN_OFFSET 2 /* start of actual stream of pattern masks */
#define PAT_COUNT_OFFSET(len) (1 + len) /* including "fixed", "len" (of pattern stream) */
#define PAT_TOT_MIN_OFFSET(len) (1 + len + 1) /* including "fixed", "len" (of pattern stream) and "count" */
#define PAT_TOT_MAX_OFFSET(len) (1 + len + 2) /* including "fixed", "len" (of pattern stream), "count", "tot_min" */
/* Note that if fixed length pattern, max[0] is same as min[0], hence the (1 - fixed) calculations below */
#define PAT_MIN_BEGIN_OFFSET(fixed,len,count) (1 + len + 3) /* offset of min[0] */
#define PAT_MAX_BEGIN_OFFSET(fixed,len,count) (1 + len + 3 + (count * (1 - fixed))) /* offset of max[0] */
#define PAT_SIZE_BEGIN_OFFSET(fixed,len,count) (PAT_MAX_BEGIN_OFFSET(fixed,len,count) + count) /* offset of size[0] */
typedef struct pattern_struct
{
struct pattern_struct *flink;
uint4 *typemask;
unsigned char *patYZnam;
int *patYZlen;
int patYZnum;
int namlen;
char name[2]; /* must be last entity in structure */
} pattern;
struct leaf
{
boolean_t nullable[MAX_SYM];
int4 letter[MAX_SYM][MAX_DFA_STRLEN + 1];
};
struct node
{
boolean_t nullable[MAX_SYM];
boolean_t last[MAX_SYM][MAX_SYM];
};
struct e_table
{
int meta_c[CHAR_CLASSES][26],
num_e[CHAR_CLASSES];
};
struct c_trns_tb
{
int c[2 * MAX_SYM];
int4 p_msk[2 * MAX_SYM][CHAR_CLASSES];
int trns[2 * MAX_SYM][CHAR_CLASSES];
};
typedef struct ptstr_struct {
int4 len;
uint4 buff[MAX_PATOBJ_LENGTH];
} ptstr;
/* The following flags are the attributes of the pattern string literal (PATM_STRLIT) */
#define PATM_STRLIT_NONASCII (1 << 0) /* whether the string contains non-ASCII (multi-byte) characters */
#define PATM_STRLIT_BADCHAR (1 << 1) /* whether the string contains malformed UTF8 byte sequences */
typedef struct patstrlit_struct
{
int4 bytelen; /* number of bytes in the buffer */
int4 charlen; /* number of characters (single or multi-byte) in the PATM_STRLIT */
uint4 flags; /* various attributes about the pattern string literal */
unsigned char buff[MAX_PATTERN_LENGTH - 2];
} pat_strlit;
/* The following structure caches the evaluation status of whether a pattern string matches a given <strptr, strlen>.
* This remembering enables us to avoid recomputation (and hence a lot of CPU cycles) in case the same need arises again.
* Least frequently used pte_csh structures (i.e. having the smallest "count" field) will be preempted for new entries.
* More comments in gbldefs.c above the pte_csh* structure GBLDEFs.
*/
typedef struct pte_csh_struct {
char *patptr;
char *strptr;
int4 charlen;
int repcnt; /* recursion level */
uint4 count; /* indicates frequency of access. The least "count" valued entry gets pre-empted */
boolean_t match;
} pte_csh;
#define PTE_BEGIN_ENTRIES 1024 /* initial pte_csh_array size. if too high, causes us initializing overhead.
* if too less, doesn't maintain most frequently accessed patterns.
*/
#define PTE_MAX_ENTRIES 16384 /* maximum pte_csh array size */
#define PTE_STRLEN_CUTOFF 48 /* strlen value >= PTE_STRLEN_CUTOFF use up the tail of the pt_csh array */
#define PTE_FILL_RATIO_NUM 3 /* fill ratio is percentage of pte_csh_array that has 1-1 correspondence with "strlen" */
#define PTE_FILL_RATIO_DEN 4 /* this is expressed as the fraction PTE_FILL_RATIO_NUM/PTE_FILL_RATIO_DEN */
#define PTE_NOT_FOUND -1
#define MIN_SPLIT_N_MATCH_COUNT 3 /* need at least 3 pattern atoms to try splitting the pattern and matching subpatterns */
#define DO_PATSPLIT_FAIL -1 /* attempt to split (at fixed pattern) and match left and right subpatterns failed */
#define PTE_CSH_MISS_FACTOR 8 /* i.e. we allow 1/8 to be the maximum cache miss percent */
#define PTE_MAX_CURALT_DEPTH 2 /* max. number of levels of alternation nesting for which we maintain a pte_csh array */
#define UPDATE_CUR_PTE_CSH_MINUS_ARRAY(cur_pte_csh_size, cur_pte_csh_entries_per_len, cur_pte_csh_tail_count) \
{ \
cur_pte_csh_size = pte_csh_cur_size[curalt_depth]; \
cur_pte_csh_entries_per_len = pte_csh_entries_per_len[curalt_depth]; \
cur_pte_csh_tail_count = pte_csh_tail_count[curalt_depth]; \
}
#define UPDATE_CUR_PTE_CSH(cur_pte_csh_array, cur_pte_csh_size, cur_pte_csh_entries_per_len, cur_pte_csh_tail_count) \
{ \
cur_pte_csh_array = pte_csh_array[curalt_depth]; \
UPDATE_CUR_PTE_CSH_MINUS_ARRAY(cur_pte_csh_size, cur_pte_csh_entries_per_len, cur_pte_csh_tail_count) \
}
#define PTE_CSH_INCR_CURALT_DEPTH(curalt_depth) \
{ \
curalt_depth++; \
assert(0 <= curalt_depth); \
if (PTE_MAX_CURALT_DEPTH > curalt_depth) \
{ \
if (NULL == pte_csh_array[curalt_depth]) \
{ \
pte_csh_array[curalt_depth] = malloc(SIZEOF(pte_csh) * PTE_BEGIN_ENTRIES); \
pte_csh_alloc_size[curalt_depth] = PTE_BEGIN_ENTRIES; \
} \
pte_csh_entries_per_len[curalt_depth] = ((PTE_BEGIN_ENTRIES / PTE_FILL_RATIO_DEN) * PTE_FILL_RATIO_NUM) \
/ PTE_STRLEN_CUTOFF; \
pte_csh_tail_count[curalt_depth] = (PTE_BEGIN_ENTRIES / PTE_FILL_RATIO_DEN) * \
(PTE_FILL_RATIO_DEN - PTE_FILL_RATIO_NUM); \
pte_csh_cur_size[curalt_depth] = PTE_BEGIN_ENTRIES; \
assert(pte_csh_cur_size[curalt_depth] <= pte_csh_alloc_size[curalt_depth]); \
memset(pte_csh_array[curalt_depth], 0, SIZEOF(pte_csh) * PTE_BEGIN_ENTRIES); \
/* reset cur_pte_csh* globals to point to new curalt_depth */ \
UPDATE_CUR_PTE_CSH(cur_pte_csh_array, cur_pte_csh_size, cur_pte_csh_entries_per_len, cur_pte_csh_tail_count); \
do_patalt_hits[curalt_depth] = do_patalt_calls[curalt_depth] = 1; \
} \
}
#define PTE_CSH_DECR_CURALT_DEPTH(curalt_depth) \
{ \
assert(0 <= curalt_depth); \
curalt_depth--; \
if ((PTE_MAX_CURALT_DEPTH > curalt_depth) && (0 <= curalt_depth)) \
{ /* reset cur_pte_csh* globals to point to new curalt_depth */ \
UPDATE_CUR_PTE_CSH(cur_pte_csh_array, cur_pte_csh_size, cur_pte_csh_entries_per_len, cur_pte_csh_tail_count); \
} \
}
int do_patalt(
uint4 *firstalt,
unsigned char *strptr,
unsigned char *strtop,
int4 repmin,
int4 repmax,
int totchar,
int repcnt,
int4 min_incr,
int4 max_incr);
int do_patfixed(mval *str, mval *pat);
int do_pattern(mval *str, mval *pat);
int do_patsplit(mval *str, mval *pat);
void genpat(mstr *input, mval *patbuf);
int getpattabnam(mstr *outname);
int initialize_pattern_table(void);
int load_pattern_table(int name_len, char *file_name);
int patmaskseq(uint4 number);
int setpattab(mstr *table_name);
int patstr(mstr *instr, ptstr *obj, unsigned char **relay);
int dfa_calc(
struct leaf *leaves,
int leaf_num,
struct e_table *expand,
uint4 **fstchar_ptr,
uint4 **outchar_ptr);
boolean_t pat_unwind(
int *count,
struct leaf *leaves,
int leaf_num,
int *total_min,
int *total_max,
int min[],
int max[],
int size[],
int altmin,
int altmax,
boolean_t *last_infinite_ptr,
uint4 **fstchar_ptr,
uint4 **outchar_ptr,
uint4 **lastpatptr_ptr);
boolean_t add_atom(
int *count,
uint4 pattern_mask,
pat_strlit *strlit_buff,
boolean_t infinite,
int *min,
int *max,
int *size,
int *total_min,
int *total_max,
int lower_bound,
int upper_bound,
int altmin,
int altmax,
boolean_t *last_infinite_ptr,
uint4 **fstchar_ptr,
uint4 **outchar_ptr,
uint4 **lastpatptr_ptr);
int pat_compress(
uint4 pattern_mask,
pat_strlit *strlit_buff,
boolean_t infinite,
boolean_t last_infinite,
uint4 *lastpatptr);
#endif