fis-gtm/sr_port/matchc.c

/****************************************************************
 *								*
 *	Copyright 2001, 2009 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"
#include "matchc.h"

/* Common "no match" exit for matchc()/matchb().
 * Uses the following names from the enclosing function: res, numpcs, numpcs_unmatched and src_top.
 * Sets *res to 0, stores the count of still-unmatched pieces in *numpcs and returns src_top.
 * Wrapped in do { } while (0) so that "RETURN_NOMATCH;" is exactly one statement and therefore
 * remains safe inside an unbraced if/else.
 */
#define	RETURN_NOMATCH			\
do					\
{					\
	*res = 0;			\
	assert(0 < numpcs_unmatched);	\
	*numpcs = numpcs_unmatched;	\
	return src_top;			\
} while (0)

/* Common "match found" exit for matchc()/matchb().
 * Uses the following names from the enclosing function: res, numpcs, numpcs_unmatched and src_ptr.
 * Sets *res to RET, sets *numpcs to 0 (every requested piece was matched) and returns src_ptr.
 * RET is parenthesized so that any expression can be passed, and the body is wrapped in
 * do { } while (0) so that "RETURN_YESMATCH(x);" is exactly one statement (safe in an unbraced if/else).
 */
#define	RETURN_YESMATCH(RET)		\
do					\
{					\
	*res = (RET);			\
	assert(0 == numpcs_unmatched);	\
	*numpcs = 0;			\
	return src_ptr;			\
} while (0)

/*
 * -----------------------------------------------
 * Pseudo equivalent of VAX matchc instruction
 *
 * Arguments:
 *	del_len	- delimiter length
 *	del_str - pointer to delimiter string
 *	src_len	- length of source string
 *	src_str	- pointer to source string
 *	res	- pointer to the result
 *	numpcs  - pointer to the number of pieces that are desired to be matched.
 *
 * Return:
 *	pointer to next character after match substring
 *	in the source string, if found.  Otherwise src_str + src_len.
 *
 * Side effects:
 *	set res arg to:
 *		0 		- if match not found
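 *		1 + char_len 	- if match found, where char_len is the number of source characters
 *				  (bytes in matchb) up to and including the last matched delimiter, so
 *				  the result is the 1-based position of the next character after it.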
 *	set numpcs arg to # of pieces that could not be matched (because end of source string was reached before then)
 * -----------------------------------------------
 */
#ifdef UNICODE_SUPPORTED
#include "gtm_utf8.h"

GBLREF	boolean_t	gtm_utf8_mode;
GBLREF	boolean_t	badchar_inhibit;

/* multi-byte character-oriented substring matching
 *
 * Searches the source string for the (*numpcs)-th occurrence of the delimiter, advancing through the source
 * one UTF-8 character at a time so that the reported position is a character (not byte) position.
 * When gtm_utf8_mode is off the call is forwarded unchanged to matchb().
 *
 * Arguments:
 *	del_len	- delimiter length in bytes (0 matches immediately)
 *	del_str - pointer to delimiter string
 *	src_len	- length of source string in bytes
 *	src_str	- pointer to source string
 *	res	- out: 0 if no match, else 1 + number of source characters up to and including the last match
 *	numpcs  - in: number of delimiter occurrences to find (must be > 0);
 *		  out: number of occurrences that could not be found (0 on success)
 *
 * Return:
 *	pointer just past the last matched delimiter on success, src_str + src_len otherwise
 *	(src_str itself for a zero-length delimiter).
 *
 * Invalid byte sequences are reported through utf8_badchar() unless badchar_inhibit is set; when it is set,
 * a character that decodes to WEOF is compared by its first byte instead of by code point.
 */
unsigned char *matchc(int del_len, unsigned char *del_str, int src_len, unsigned char *src_str, int *res, int *numpcs)
{
	unsigned char 	*src_ptr, *src_top, *src_next, *del_ptr, *del_top, *del_next, *del_next1, *restart_ptr;
	wint_t		src_cp, del_cp, del_cp1;	/* code points for the source and delimiter characters */
	int		char_len, restart_char_len, bytelen, numpcs_unmatched;

	if (!gtm_utf8_mode)
		return matchb(del_len, del_str, src_len, src_str, res, numpcs);
	assert(0 <= del_len);
	assert(0 < *numpcs);
	if (0 == del_len)
	{	/* always matches a null string */
		*numpcs = 0;
		*res = 1;
		return src_str;
	}
	src_ptr = src_str;
	src_top = src_str + src_len;
	del_top = del_str + del_len;
	/* Check UTF8 byte sequence validity of delimiter string. The following code is very similar to utf8_len() but
	 * we don't invoke the function here for performance reasons as this piece of code is used by heavy hitters like $piece.
	 * Also, the code below can be forked off into two cases depending on the value of "badchar_inhibit". This is a
	 * performance enhancement that can be done later if this is found to be a bottleneck.
	 */
	if (!badchar_inhibit)
	{
		for (del_ptr = del_str; del_ptr < del_top; del_ptr += bytelen)
		{	/* bytelen is filled in by UTF8_VALID and used to step to the next delimiter character */
			if (!UTF8_VALID(del_ptr, del_top, bytelen))
				utf8_badchar(0, del_ptr, del_top, 0, NULL);
		}
	}
	numpcs_unmatched = *numpcs;	/* note down # of pieces left to match */
	/* compute the code point of the 1st delimiter char */
	del_next1 = UTF8_MBTOWC(del_str, del_top, del_cp1);
	assert((WEOF != del_cp1) || badchar_inhibit);
	/* char_len counts the source characters consumed so far. Stop scanning once fewer bytes remain in the
	 * source than the delimiter needs.
	 */
	for (char_len = 0; (src_ptr < src_top) && (src_top - src_ptr) >= del_len; )
	{
		src_next = src_ptr;
		do
		{	/* find the occurrence of 1st delimiter char in the source */
			src_ptr = src_next;
			src_next = UTF8_MBTOWC(src_ptr, src_top, src_cp);
			if ((WEOF == src_cp) && !badchar_inhibit)
				utf8_badchar(0, src_ptr, src_top, 0, NULL);
			++char_len; /* maintain the source character position */
		} while ((src_next < src_top) && ((src_cp != del_cp1) || ((WEOF == src_cp) && (*src_ptr != *del_str))));

		/* Same test as the loop condition above: code points differ, or both are WEOF but the bytes differ */
		if ((src_cp != del_cp1) || ((WEOF == src_cp) && (*src_ptr != *del_str)))
		{	/* could not find the 1st delimiter char in the source */
			RETURN_NOMATCH;
		}
		/* 1st delimiter character match found. match the other delimiter characters */
		del_ptr = del_next1; 		/* advance past the 1st delimiter character */
		restart_ptr = src_ptr = src_next; /* advance past the 1st source character */
		restart_char_len = char_len;
		for ( ; (src_ptr < src_top) && (del_ptr < del_top); src_ptr = src_next, del_ptr = del_next, ++char_len)
		{
			src_next = UTF8_MBTOWC(src_ptr, src_top, src_cp);
			if ((WEOF == src_cp) && !badchar_inhibit)
				utf8_badchar(0, src_ptr, src_top, 0, NULL);
			del_next = UTF8_MBTOWC(del_ptr, del_top, del_cp);
			if ((src_cp != del_cp) || ((WEOF == src_cp) && *src_ptr != *del_ptr))
			{	/* match lost. restart the search skipping the first delimiter character */
				src_ptr = restart_ptr;
				char_len = restart_char_len;
				break;
			}
		}
		/* del_ptr reaches del_top only if every delimiter character matched; on a lost match or when the
		 * source ran out first, del_ptr is still short of del_top.
		 */
		if (del_ptr >= del_top)
		{	/* Match found : Return success if no more pieces to match else continue with scan */
			assert(del_top == del_ptr);
			assert(0 < numpcs_unmatched);
			if (0 == --numpcs_unmatched)
				RETURN_YESMATCH(1 + char_len);
		}
	}
	RETURN_NOMATCH;
}
#endif /* UNICODE_SUPPORTED */

/* byte-oriented substring matching
 *
 * Searches the source string for the (*numpcs)-th non-overlapping occurrence of the delimiter, comparing
 * byte by byte.
 *
 * Arguments:
 *	del_len	- delimiter length in bytes (0 matches immediately)
 *	del_str - pointer to delimiter string
 *	src_len	- length of source string in bytes
 *	src_str	- pointer to source string (also reused locally as the start of the current match candidate)
 *	res	- out: 0 if no match, else 1 + byte offset just past the last matched delimiter
 *	numpcs  - in: number of delimiter occurrences to find (must be > 0);
 *		  out: number of occurrences that could not be found (0 on success)
 *
 * Return:
 *	pointer just past the last matched delimiter on success, start of source + src_len otherwise
 *	(src_str itself for a zero-length delimiter).
 */
unsigned char *matchb(int del_len, unsigned char *del_str, int src_len, unsigned char *src_str, int *res, int *numpcs)
{
	unsigned char 	*src_ptr, *pdel, *src_base, *src_top, *del_top;
	int 		numpcs_unmatched;
	boolean_t	match_found;

	assert(0 <= del_len);
	assert(0 < *numpcs);
	if (0 == del_len)
	{	/* always matches a null string */
		*numpcs = 0;
		*res = 1;
		return src_str;
	}
	numpcs_unmatched = *numpcs;	/* note down # of pieces to be matched */
	src_ptr = src_base = src_str;	/* src_base stays at the start of the source for the offset computation */
	src_top = src_ptr + src_len;
	if (src_len < del_len)	/* Input string is shorter than delimiter string so no match possible */
		RETURN_NOMATCH;
	del_top = del_str + del_len;
	pdel = del_str;
	/* At the top of every iteration src_ptr == src_str (start of the next candidate) and pdel == del_str */
	while (src_ptr < src_top)
	{
		/* Quick Find 1st delimiter char */
		while (*src_ptr != *pdel)
		{
			src_ptr = ++src_str;
			if (src_ptr == src_top)
				RETURN_NOMATCH;
		}
		match_found = FALSE;
		/* Found delimiter */
		while (*src_ptr++ == *pdel++)
		{
			if (pdel == del_top)
			{	/* Found matching piece. */
				match_found = TRUE;
				break;
			}
			/* Source exhausted in the middle of a partial match: any later candidate would have
			 * even fewer bytes available, so no match is possible.
			 */
			if (src_ptr == src_top)
				RETURN_NOMATCH;
		}
		if (match_found)
		{	/* Return success if no more pieces to match else continue with scan */
			assert(0 < numpcs_unmatched);
			if (0 == --numpcs_unmatched)
				RETURN_YESMATCH(INTCAST(1 + (src_ptr - src_base)));
			src_str = src_ptr;	/* resume the scan right after this delimiter */
		} else
			src_ptr = ++src_str; /* Match lost, goto next source character */
		pdel = del_str;
	}
	RETURN_NOMATCH;
}
ENH: Initial import from sourceforge. This source tree was directly imported from http://sourceforge.net/projects/fis-gtm/files/GT.M-x86-Linux-src/V5.4-002B/ by extracting the file: gtm_V54002B_linux_i686_src.tar.gz 2012-02-05 11:35:58 -05:00			`/****************************************************************`
			`* *`
			`* Copyright 2001, 2009 Fidelity Information Services, Inc *`
			`* *`
			`* This source code contains the intellectual property *`
			`* of its copyright holder(s), and is made available *`
			`* under a license. If you do not know the terms of *`
			`* the license, please stop and do not read further. *`
			`* *`
			`****************************************************************/`

			`#include "mdef.h"`
			`#include "matchc.h"`

			`#define RETURN_NOMATCH \`
			`{ \`
			`*res = 0; \`
			`assert(0 < numpcs_unmatched); \`
			`*numpcs = numpcs_unmatched; \`
			`return src_top; \`
			`}`

			`#define RETURN_YESMATCH(RET) \`
			`{ \`
			`*res = RET; \`
			`assert(0 == numpcs_unmatched); \`
			`*numpcs = 0; \`
			`return src_ptr; \`
			`}`

			`/*`
			`* -----------------------------------------------`
			`* Pseudo equivalent of VAX matchc instruction`
			`*`
			`* Arguments:`
			`* del_len - delimiter length`
			`* del_str - pointer to delimiter string`
			`* src_len - length of source string`
			`* src_str - pointer to source string`
			`* res - pointer to the result`
			`* numpcs - pointer to the number of pieces that are desired to be matched.`
			`*`
			`* Return:`
			`* pointer to next character after match substring`
			`* in the source string, if found. Otherwise src_str + src_len.`
			`*`
			`* Side effects:`
			`* set res arg to:`
			`* 0 - if match not found`
			`* 1 + char_len - if match found, where char_len is the position`
			`* of the next character after the match substring.`
			`* set numpcs arg to # of pieces that could not be matched (because end of source string was reached before then)`
			`* -----------------------------------------------`
			`*/`
			`#ifdef UNICODE_SUPPORTED`
			`#include "gtm_utf8.h"`

			`GBLREF boolean_t gtm_utf8_mode;`
			`GBLREF boolean_t badchar_inhibit;`

			`/* multi-byte character-oriented substring matching */`
			`unsigned char *matchc(int del_len, unsigned char *del_str, int src_len, unsigned char *src_str, int *res, int *numpcs)`
			`{`
			`unsigned char *src_ptr, *src_top, *src_next, *del_ptr, *del_top, *del_next, *del_next1, *restart_ptr;`
			`wint_t src_cp, del_cp, del_cp1; /* code points for the source and delimiter characters */`
			`int char_len, restart_char_len, del_charlen, bytelen, numpcs_unmatched;`

			`if (!gtm_utf8_mode)`
			`return matchb(del_len, del_str, src_len, src_str, res, numpcs);`
			`assert(0 <= del_len);`
			`assert(0 < *numpcs);`
			`if (0 == del_len)`
			`{ /* always matches a null string */`
			`*numpcs = 0;`
			`*res = 1;`
			`return src_str;`
			`}`
			`src_ptr = src_str;`
			`src_top = src_str + src_len;`
			`del_top = del_str + del_len;`
			`/* Check UTF8 byte sequence validity of delimiter string. The following code is very similar to utf8_len() but`
			`* we dont invoke the function here for performance reasons as this piece of code is used by heavy hitters like $piece.`
			`* Also, the code below can be forked off into two cases depending on the value of "badchar_inhibit". This is a`
			`* performance enhancement that can be done later if this is found to be a bottleneck.`
			`*/`
			`if (!badchar_inhibit)`
			`{`
			`for (del_charlen = 0, del_ptr = del_str; del_ptr < del_top; del_charlen++, del_ptr += bytelen)`
			`{`
			`if (!UTF8_VALID(del_ptr, del_top, bytelen))`
			`utf8_badchar(0, del_ptr, del_top, 0, NULL);`
			`}`
			`}`
			`numpcs_unmatched = *numpcs; /* note down # of pieces left to match */`
			`/* compute the code point of the 1st delimiter char */`
			`del_next1 = UTF8_MBTOWC(del_str, del_top, del_cp1);`
			`assert((WEOF != del_cp1) \|\| badchar_inhibit);`
			`for (char_len = 0; (src_ptr < src_top) && (src_top - src_ptr) >= del_len; )`
			`{`
			`src_next = src_ptr;`
			`do`
			`{ /* find the occurrence of 1st delimiter char in the source */`
			`src_ptr = src_next;`
			`src_next = UTF8_MBTOWC(src_ptr, src_top, src_cp);`
			`if ((WEOF == src_cp) && !badchar_inhibit)`
			`utf8_badchar(0, src_ptr, src_top, 0, NULL);`
			`++char_len; /* maintain the source character position */`
			`} while ((src_next < src_top) && ((src_cp != del_cp1) \|\| ((WEOF == src_cp) && (*src_ptr != *del_str))));`

			`if ((src_cp != del_cp1) \|\| (WEOF == src_cp) && (*src_ptr != *del_str))`
			`{ /* could not find the 1st delimiter char in the source */`
			`RETURN_NOMATCH;`
			`}`
			`/* 1st delimiter character match found. match the other delimiter characters */`
			`del_ptr = del_next1; /* advance past the 1st delimiter character */`
			`restart_ptr = src_ptr = src_next; /* advance past the 1st source character */`
			`restart_char_len = char_len;`
			`for ( ; (src_ptr < src_top) && (del_ptr < del_top); src_ptr = src_next, del_ptr = del_next, ++char_len)`
			`{`
			`src_next = UTF8_MBTOWC(src_ptr, src_top, src_cp);`
			`if ((WEOF == src_cp) && !badchar_inhibit)`
			`utf8_badchar(0, src_ptr, src_top, 0, NULL);`
			`del_next = UTF8_MBTOWC(del_ptr, del_top, del_cp);`
			`if ((src_cp != del_cp) \|\| ((WEOF == src_cp) && *src_ptr != *del_ptr))`
			`{ /* match lost. restart the search skipping the first delimiter character */`
			`src_ptr = restart_ptr;`
			`char_len = restart_char_len;`
			`break;`
			`}`
			`}`
			`if (del_ptr >= del_top)`
			`{ /* Match found : Return success if no more pieces to match else continue with scan */`
			`assert(del_top == del_ptr);`
			`assert(0 < numpcs_unmatched);`
			`if (0 == --numpcs_unmatched)`
			`RETURN_YESMATCH(1 + char_len);`
			`}`
			`}`
			`RETURN_NOMATCH;`
			`}`
			`#endif /* UNICODE_SUPPORTED */`

			`/* byte-oriented substring matching */`
			`unsigned char *matchb(int del_len, unsigned char *del_str, int src_len, unsigned char *src_str, int *res, int *numpcs)`
			`{`
			`unsigned char *src_ptr, *pdel, *src_base, *src_top, *del_top;`
			`int src_cnt, numpcs_unmatched;`
			`boolean_t match_found;`

			`assert(0 <= del_len);`
			`assert(0 < *numpcs);`
			`if (0 == del_len)`
			`{ /* always matches a null string */`
			`*numpcs = 0;`
			`*res = 1;`
			`return src_str;`
			`}`
			`numpcs_unmatched = *numpcs; /* note down # of pieces to be matched */`
			`src_ptr = src_base = src_str;`
			`src_top = src_ptr + src_len;`
			`if (src_len < del_len) /* Input string is shorter than delimiter string so no match possible */`
			`RETURN_NOMATCH;`
			`del_top = del_str + del_len;`
			`pdel = del_str;`
			`while (src_ptr < src_top)`
			`{`
			`/* Quick Find 1st delimiter char */`
			`while (*src_ptr != *pdel)`
			`{`
			`src_ptr = ++src_str;`
			`if (src_ptr == src_top)`
			`RETURN_NOMATCH;`
			`}`
			`match_found = FALSE;`
			`/* Found delimiter */`
			`while (*src_ptr++ == *pdel++)`
			`{`
			`if (pdel == del_top)`
			`{ /* Found matching piece. */`
			`match_found = TRUE;`
			`break;`
			`}`
			`if (src_ptr == src_top)`
			`RETURN_NOMATCH;`
			`}`
			`if (match_found)`
			`{ /* Return success if no more pieces to match else continue with scan */`
			`assert(0 < numpcs_unmatched);`
			`if (0 == --numpcs_unmatched)`
			`RETURN_YESMATCH(INTCAST(1 + (src_ptr - src_base)));`
			`src_str = src_ptr;`
			`} else`
			`src_ptr = ++src_str; /* Match lost, goto next source character */`
			`pdel = del_str;`
			`}`
			`RETURN_NOMATCH;`
			`}`