fis-gtm/sr_port/matchc.c

/****************************************************************
 *								*
 *	Copyright 2001, 2009 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"
#include "matchc.h"

/* Common "no match" exit for the matching routines in this file: stores 0 in *res, reports the number of
 * pieces still unmatched through *numpcs and returns src_top from the ENCLOSING function. The names res,
 * numpcs, numpcs_unmatched and src_top must be in scope where the macro is used.
 * The body is wrapped in do/while(0) so that "RETURN_NOMATCH;" is exactly one statement and therefore
 * safe as the unbraced body of an if that has an else.
 */
#define	RETURN_NOMATCH			\
do					\
{					\
	*res = 0;			\
	assert(0 < numpcs_unmatched);	\
	*numpcs = numpcs_unmatched;	\
	return src_top;			\
} while (0)

/* Common "match found" exit for the matching routines in this file: stores RET in *res, stores 0 in
 * *numpcs (nothing left unmatched) and returns src_ptr from the ENCLOSING function. The names res,
 * numpcs, numpcs_unmatched and src_ptr must be in scope where the macro is used.
 * RET is parenthesized so that any expression can be passed, and the body is wrapped in do/while(0) so
 * that "RETURN_YESMATCH(x);" is exactly one statement (safe as the unbraced body of an if/else).
 */
#define	RETURN_YESMATCH(RET)		\
do					\
{					\
	*res = (RET);			\
	assert(0 == numpcs_unmatched);	\
	*numpcs = 0;			\
	return src_ptr;			\
} while (0)

/*
 * -----------------------------------------------
 * Pseudo equivalent of VAX matchc instruction
 *
 * Arguments:
 *	del_len	- delimiter length
 *	del_str - pointer to delimiter string
 *	src_len	- length of source string
 *	src_str	- pointer to source string
 *	res	- pointer to the result
 *	numpcs  - pointer to the number of pieces that are desired to be matched.
 *
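 * Return:
 *	pointer to next character after match substring
 *	in the source string, if found.  Otherwise src_str + src_len.
 *	A zero-length delimiter always matches and returns src_str itself.
 *
 * Side effects:
 *	set res arg to:
 *		0 		- if match not found
 *		1 + char_len 	- if match found, where char_len is the number of source characters
 *				  (bytes in the case of matchb) up to and including the matched
 *				  delimiter, i.e. res is the 1-based position of the next character
 *				  after the match substring (1 for a zero-length delimiter).
 *	set numpcs arg to # of pieces that could not be matched (because end of source string was reached before then);
 *		0 if all the requested pieces were matched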
 * -----------------------------------------------
 */
#ifdef UNICODE_SUPPORTED
#include "gtm_utf8.h"

GBLREF	boolean_t	gtm_utf8_mode;
GBLREF	boolean_t	badchar_inhibit;

/* multi-byte character-oriented substring matching
 *
 * Looks for the "*numpcs"-th occurrence of the delimiter in the source string, comparing the two strings one
 * character (code point) at a time and counting the source position in characters rather than bytes. The
 * search for the next occurrence resumes right after the previously matched delimiter. When "gtm_utf8_mode"
 * is off the request is handed to matchb() unchanged. Arguments and results are as described in the interface
 * comment near the top of this file; *res is expressed in characters here.
 *
 * Unless "badchar_inhibit" is set, utf8_badchar() is invoked for an invalid byte sequence found in the
 * delimiter or in the scanned part of the source. Two characters that both decode to WEOF are treated as
 * equal only if their first bytes are identical.
 */
unsigned char *matchc(int del_len, unsigned char *del_str, int src_len, unsigned char *src_str, int *res, int *numpcs)
{
	unsigned char 	*src_ptr, *src_top, *src_next, *del_ptr, *del_top, *del_next, *del_next1, *restart_ptr;
	wint_t		src_cp, del_cp, del_cp1;	/* code points for the source and delimiter characters */
	int		char_len, restart_char_len, bytelen, numpcs_unmatched;

	if (!gtm_utf8_mode)
		return matchb(del_len, del_str, src_len, src_str, res, numpcs);
	assert(0 <= del_len);
	assert(0 < *numpcs);
	if (0 == del_len)
	{	/* always matches a null string */
		*numpcs = 0;
		*res = 1;
		return src_str;
	}
	src_ptr = src_str;
	src_top = src_str + src_len;
	del_top = del_str + del_len;
	/* Check UTF8 byte sequence validity of delimiter string. The following code is very similar to utf8_len() but
	 * we don't invoke the function here for performance reasons as this piece of code is used by heavy hitters like $piece.
	 * Also, the code below can be forked off into two cases depending on the value of "badchar_inhibit". This is a
	 * performance enhancement that can be done later if this is found to be a bottleneck.
	 * NOTE(review): UTF8_VALID is presumed to store the byte length of the sequence at del_ptr in "bytelen"
	 * (the loop advances by it) - confirm against gtm_utf8.h.
	 */
	if (!badchar_inhibit)
	{
		for (del_ptr = del_str; del_ptr < del_top; del_ptr += bytelen)
		{
			if (!UTF8_VALID(del_ptr, del_top, bytelen))
				utf8_badchar(0, del_ptr, del_top, 0, NULL);
		}
	}
	numpcs_unmatched = *numpcs;	/* note down # of pieces left to match */
	/* compute the code point of the 1st delimiter char; del_next1 is where the 2nd delimiter char starts */
	del_next1 = UTF8_MBTOWC(del_str, del_top, del_cp1);
	assert((WEOF != del_cp1) || badchar_inhibit);
	/* char_len counts the source characters consumed so far. Stop as soon as fewer source bytes remain than
	 * the delimiter has, since no further match is possible then.
	 */
	for (char_len = 0; (src_ptr < src_top) && (src_top - src_ptr) >= del_len; )
	{
		src_next = src_ptr;
		do
		{	/* find the occurrence of 1st delimiter char in the source */
			src_ptr = src_next;
			src_next = UTF8_MBTOWC(src_ptr, src_top, src_cp);
			if ((WEOF == src_cp) && !badchar_inhibit)
				utf8_badchar(0, src_ptr, src_top, 0, NULL);
			++char_len; /* maintain the source character position */
		} while ((src_next < src_top) && ((src_cp != del_cp1) || ((WEOF == src_cp) && (*src_ptr != *del_str))));

		/* The scan above also stops at the end of the source, so re-check whether the last character examined
		 * really is the 1st delimiter character (same test as the loop condition).
		 */
		if ((src_cp != del_cp1) || ((WEOF == src_cp) && (*src_ptr != *del_str)))
		{	/* could not find the 1st delimiter char in the source */
			RETURN_NOMATCH;
		}
		/* 1st delimiter character match found. match the other delimiter characters */
		del_ptr = del_next1; 		/* advance past the 1st delimiter character */
		restart_ptr = src_ptr = src_next; /* advance past the 1st source character */
		restart_char_len = char_len;
		for ( ; (src_ptr < src_top) && (del_ptr < del_top); src_ptr = src_next, del_ptr = del_next, ++char_len)
		{
			src_next = UTF8_MBTOWC(src_ptr, src_top, src_cp);
			if ((WEOF == src_cp) && !badchar_inhibit)
				utf8_badchar(0, src_ptr, src_top, 0, NULL);
			del_next = UTF8_MBTOWC(del_ptr, del_top, del_cp);
			if ((src_cp != del_cp) || ((WEOF == src_cp) && *src_ptr != *del_ptr))
			{	/* match lost. restart the search skipping the first delimiter character */
				src_ptr = restart_ptr;
				char_len = restart_char_len;
				break;
			}
		}
		/* del_ptr reaches del_top only if every delimiter character was matched. If the source ran out in
		 * the middle of the delimiter instead, del_ptr is still short of del_top and the outer loop ends
		 * because src_ptr has reached src_top.
		 */
		if (del_ptr >= del_top)
		{	/* Match found : Return success if no more pieces to match else continue with scan */
			assert(del_top == del_ptr);
			assert(0 < numpcs_unmatched);
			if (0 == --numpcs_unmatched)
				RETURN_YESMATCH(1 + char_len);
		}
	}
	RETURN_NOMATCH;
}
#endif /* UNICODE_SUPPORTED */

/* byte-oriented substring matching
 *
 * Finds the "*numpcs"-th occurrence of the "del_len"-byte delimiter "del_str" in the "src_len"-byte source
 * "src_str". Occurrences are located left to right and do not overlap: after a match, the scan resumes at
 * the byte following the matched delimiter.
 *
 * Returns a pointer to the byte following the last required occurrence, with *res set to the 1-based
 * offset of that byte within the source and *numpcs set to 0. If the source is exhausted first, returns
 * src_str + src_len with *res set to 0 and *numpcs set to the number of occurrences still missing.
 * A zero-length delimiter always matches: src_str is returned with *res set to 1 and *numpcs set to 0.
 */
unsigned char *matchb(int del_len, unsigned char *del_str, int src_len, unsigned char *src_str, int *res, int *numpcs)
{
	/* Note: src_ptr, src_top and numpcs_unmatched are referenced by name from RETURN_YESMATCH/RETURN_NOMATCH */
	unsigned char	*src_ptr, *src_top, *cand, *del_ptr, *del_top;
	int		numpcs_unmatched;

	assert(0 <= del_len);
	assert(0 < *numpcs);
	if (0 == del_len)
	{	/* always matches a null string */
		*numpcs = 0;
		*res = 1;
		return src_str;
	}
	numpcs_unmatched = *numpcs;	/* note down # of pieces to be matched */
	src_top = src_str + src_len;
	if (src_len < del_len)	/* Input string is shorter than delimiter string so no match possible */
		RETURN_NOMATCH;
	del_top = del_str + del_len;
	/* "cand" is the source position currently being tried as the start of a delimiter occurrence */
	for (cand = src_str; cand < src_top; )
	{
		if (*cand != *del_str)
		{	/* Quick reject : candidate does not even start with the 1st delimiter byte */
			cand++;
			continue;
		}
		/* 1st byte matches. Compare the remaining delimiter bytes without running off either string */
		src_ptr = cand + 1;
		del_ptr = del_str + 1;
		while ((del_ptr < del_top) && (src_ptr < src_top) && (*src_ptr == *del_ptr))
		{
			src_ptr++;
			del_ptr++;
		}
		if (del_ptr == del_top)
		{	/* Found matching piece. Return success if no more pieces to match else continue with scan */
			assert(0 < numpcs_unmatched);
			if (0 == --numpcs_unmatched)
				RETURN_YESMATCH(INTCAST(1 + (src_ptr - src_str)));
			cand = src_ptr;	/* resume right after the matched delimiter */
			continue;
		}
		/* Source ran out in the middle of the delimiter : every later candidate has even fewer bytes
		 * available than this one, so no further match is possible.
		 */
		if (src_ptr == src_top)
			RETURN_NOMATCH;
		cand++;	/* Match lost, goto next source character */
	}
	RETURN_NOMATCH;
}