fis-gtm/sr_unix/gtm_utf8.h

/****************************************************************
 *								*
 *	Copyright 2006, 2010 Fidelity Information Services, Inc.*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#ifndef GTM_UTF8_H
#define GTM_UTF8_H

#include <wctype.h>
#include <wchar.h>

/*
 *=======================================================================================================================
 * UTF-8 BIT DISTRIBUTION:
 *=======================================================================================================================
 *  Code range	    Scalar value		UTF-8					Notes
 *  hexadecimal	    binary			binary
 *  -----------------------------------------------------------------------------------------------------------------------
 *  000000-00007F   0xxxxxxx			0xxxxxxx				ASCII equivalence range;
 *		    seven x			seven x					byte begins with zero
 *
 *  000080-0007FF   00000xxx xxxxxxxx		110xxxxx 10xxxxxx			first byte begins with 110,
 *		    three x, eight x		five x, six x				the following byte begins with 10.
 *
 *  000800-00FFFF   xxxxxxxx xxxxxxxx		1110xxxx 10xxxxxx 10xxxxxx		first byte begins with 1110,
 *		    eight x, eight x		four x, six x, six x			the following bytes begin with 10.
 *
 *  010000-10FFFF   000xxxxx xxxxxxxx xxxxxxxx	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx	First byte begins with 11110,
 *		    five x, eight x, eight x	three x, six x, six x, six x		the following bytes begin with 10
 *
 *  ====================================================================================================================
 *  Codepoint  Codepoint binary	      UTF-8 binary			    UTF-8 hex
 *  hex
 *  --------------------------------------------------------------------------------------------------------------------
 *  000000     0		      0					    00		# 1-byte UTF-8 encoding BEGIN
 *  00007F     1111111		      1111111				    7F		# 1-byte UTF-8 encoding END
 *
 *  000080     00010000000	      11000010 10000000			    C2 80	# 2-byte UTF-8 encoding BEGIN
 *  0007FF     11111111111	      11011111 10111111			    DF BF	# 2-byte UTF-8 encoding END
 *
 *  000800     0000100000000000	      11100000 10100000 10000000	    E0 A0 80	# 3-byte UTF-8 encoding BEGIN
 *  00D7FF     1101011111111111	      11101101 10011111 10111111	    ED 9F BF	# 3-byte UTF-8 encoding PAUSE
 *
 *  00D800     1101100000000000	      11101101 10100000 10000000	    ED A0 80	# surrogate invalid range BEGIN
 *  00DFFF     1101111111111111	      11101101 10111111 10111111	    ED BF BF	# surrogate invalid range END
 *
 *  00E000     1110000000000000	      11101110 10000000 10000000	    EE 80 80	# 3-byte UTF-8 encoding RESUME
 *  00FFFF     1111111111111111	      11101111 10111111 10111111	    EF BF BF	# 3-byte UTF-8 encoding END
 *
 *  010000     000010000000000000000  11110000 10010000 10000000 10000000   F0 90 80 80 # 4-byte UTF-8 encoding BEGIN
 *  10FFFF     100001111111111111111  11110100 10001111 10111111 10111111   F4 8F BF BF # 4-byte UTF-8 encoding END
 *  ====================================================================================================================
 *
 *======================================================================================================================
 *  UTF-16 BIT DISTRIBUTION:
 *======================================================================================================================
 *  Code range	    Scalar value		UTF-16					Notes
 *  hexadecimal	    binary			binary
 *  --------------------------------------------------------------------------------------------------------------------
 *  000000-00FFFF   xxxxxxxxxxxxxxxx            xxxxxxxxxxxxxxxx                        All code points in the Basic
 *                  sixteen x                   sixteen x                               Multi-lingual Plane (BMP)
 *
 *  010000-10FFFF   000uuuuuxxxxxxxxxxxxxxxx    110110wwwwxxxxxx 110111xxxxxxxxxx       All code points in the
 *                  five u, sixteen x           four w, six x, ten x                    Supplementary Plane (non-BMP)
 *
 *                                where wwww = uuuuu - 1
 *=======================================================================================================================
 */

/* Name string used for the UTF-8 character set */
#define UTF8_NAME		"UTF-8"

/* Character set name strings; the UTF-8 one expands to UTF8_NAME so both always agree */
#define	CHSET_M_STR		"M"
#define	CHSET_UTF8_STR		UTF8_NAME

/* Code point limits, all of type unsigned.
 *	UTF8_nBYTE_MAX	: largest code point that fits in an n-byte UTF-8 encoding
 *	U_BMP_MAX	: last code point of the Basic Multilingual Plane
 *	U_*SURROGATE_*	: bounds of the surrogate range D800-DFFF (high half D800-DBFF, low half DC00-DFFF)
 * Every replacement list is wrapped in parentheses so that the constant behaves as a single
 * operand wherever it is used (e.g. "sizeof UTF8_4BYTE_MAX" would otherwise not parse).
 */
#define	UTF8_1BYTE_MAX		((unsigned)(wint_t)(ASCII_MAX))
#define	UTF8_2BYTE_MAX		((unsigned)(wint_t)0x7FF)
#define	UTF8_3BYTE_MAX		((unsigned)(wint_t)0xFFFF)
#define	UTF8_4BYTE_MAX		((unsigned)(wint_t)0x10FFFF)
#define U_BMP_MAX		((unsigned)(wint_t)0xFFFF)
#define	U_SURROGATE_BEGIN	((unsigned)(wint_t)0xD800)
#define	U_HIGH_SURROGATE_BEGIN	((unsigned)(wint_t)0xD800)
#define	U_LOW_SURROGATE_BEGIN	((unsigned)(wint_t)0xDC00)
#define	U_SURROGATE_END		((unsigned)(wint_t)0xDFFF)

/* Number of payload bits (the x-s) carried by each kind of UTF-8 byte */
#define	UTF8_LEAD_2BYTE_BITLEN	5	/* 110xxxxx : leading byte of a 2-byte encoding */
#define	UTF8_LEAD_3BYTE_BITLEN	4	/* 1110xxxx : leading byte of a 3-byte encoding */
#define	UTF8_LEAD_4BYTE_BITLEN	3	/* 11110xxx : leading byte of a 4-byte encoding */
#define	UTF8_NONLEAD_BITLEN	6	/* 10xxxxxx : any non-leading byte */

/* Masks that extract the payload bits; each is simply "BITLEN low-order one bits" */
#define	UTF8_LEAD_2BYTEMASK	((1 << UTF8_LEAD_2BYTE_BITLEN) - 1)	/* 0x1F */
#define	UTF8_LEAD_3BYTEMASK	((1 << UTF8_LEAD_3BYTE_BITLEN) - 1)	/* 0x0F */
#define	UTF8_LEAD_4BYTEMASK	((1 << UTF8_LEAD_4BYTE_BITLEN) - 1)	/* 0x07 */
#define	UTF8_NONLEAD_BYTEMASK	((1 << UTF8_NONLEAD_BITLEN) - 1)	/* 0x3F */

/* Fixed marker bits of each kind of byte, i.e. the byte pattern with every x set to 0 */
#define	UTF8_LEAD_2BYTE_PREFIX		0xC0	/* 110xxxxx */
#define	UTF8_LEAD_3BYTE_PREFIX		0xE0	/* 1110xxxx */
#define	UTF8_LEAD_4BYTE_PREFIX		0xF0	/* 11110xxx */
#define	UTF8_NONLEAD_BYTE_PREFIX	0x80	/* 10xxxxxx */

/* Code points U+2028 and U+2029 (the line and paragraph separator, going by the macro names) */
#define	UTF_LINE_SEPARATOR	0x2028
#define	UTF_PARA_SEPARATOR	0x2029

/* Byte length of a surrogate code point in UTF-8 form (ED A0 80 - ED BF BF); used as the
 * minimum input length by UTF8_SURROGATE.
 */
#define	UTF8_SURROGATE_BYTELEN	3

#define GTM_MB_LEN_MAX		4	/* maximum bytes we support for multi-byte char */
#define GTM_MB_DISP_LEN_MAX	2	/* maximum number of display columns for a multi-byte char.
					 * all characters we know fit in a display width of 2 columns
					 */

/* Non-zero if byte x has the form 10xxxxxx, i.e. it can be a non-leading byte of a UTF-8 sequence.
 * Only the two most significant bits of the byte are examined.
 */
#define	UTF8_IS_VALID_TRAILING(x)	(0x02 == ((unsigned char)(x) >> 6))

/* Non-zero if byte x can start a UTF-8 sequence, i.e. its utf8_followlen[] entry is not -1 */
#define	UTF8_IS_VALID_LEADING(x)	((int)utf8_followlen[(unsigned char)(x)] != -1)

/* boolean_t U_IS_SURROGATE_CODE(wint_t c)
 * 	Returns non-zero if the code point (c) falls in the surrogate range
 * 	[U_SURROGATE_BEGIN, U_SURROGATE_END], 0 otherwise.
 * 	The range test is done with a single unsigned subtraction so that the argument is
 * 	evaluated exactly once (anything below the range wraps around to a huge value).
 */
#define U_IS_SURROGATE_CODE(codepoint)					\
	(((unsigned)(codepoint) - U_SURROGATE_BEGIN) <= (U_SURROGATE_END - U_SURROGATE_BEGIN))

/* boolean_t U_IS_SURROGATE_HIGH(wint_t c)
 * 	Returns non-zero if the code point (c) is a leading (high) surrogate, i.e. lies in
 * 	[U_SURROGATE_BEGIN, U_LOW_SURROGATE_BEGIN), 0 otherwise.
 * 	Written as one unsigned subtraction so that the argument is evaluated exactly once.
 */
#define U_IS_SURROGATE_HIGH(codepoint)					\
	(((unsigned)(codepoint) - U_SURROGATE_BEGIN) < (U_LOW_SURROGATE_BEGIN - U_SURROGATE_BEGIN))

/* boolean_t U_IS_SURROGATE_LOW(wint_t c)
 * 	Returns non-zero if the code point (c) is a trailing (low) surrogate, i.e. lies in
 * 	[U_LOW_SURROGATE_BEGIN, U_SURROGATE_END], 0 otherwise.
 * 	Written as one unsigned subtraction so that the argument is evaluated exactly once.
 */
#define U_IS_SURROGATE_LOW(codepoint)					\
	(((unsigned)(codepoint) - U_LOW_SURROGATE_BEGIN) <= (U_SURROGATE_END - U_LOW_SURROGATE_BEGIN))

/* Returns TRUE if the byte following the leading byte at mbptr is a valid trailing byte, i.e.
 * mbptr points to a valid 2-byte UTF-8 encoding. Otherwise returns FALSE and resets bytelen to 1.
 * The leading byte itself is not examined here.
 * Assumes "mbptr" is of type "uchar_ptr_t"; both arguments may be arbitrary expressions
 * (they are parenthesized in the expansion), "bytelen" must be an lvalue.
 */
#define	UTF8_VALID_2BYTE(mbptr, bytelen)						\
	(UTF8_IS_VALID_TRAILING((mbptr)[1])						\
		? TRUE									\
		: ((bytelen) = 1, FALSE))

/* boolean_t UTF8_NONCHAR_CODE_3BYTE(wint_t c)
 * boolean_t UTF8_NONCHAR_CODE_4BYTE(wint_t c)
 * 	Noncharacter tests on a code point (c) that is encoded in 3 or 4 UTF-8 bytes respectively.
 * 	Each yields non-zero for a noncharacter and 0 otherwise. The noncharacters are:
 * 		U+FDD0 - U+FDEF (32 code points, all 3-byte encoded)
 * 		U+nFFFE and U+nFFFF for every n from 0x0 to 0x10 (34 code points; U+FFFE and
 * 		U+FFFF are 3-byte encoded, the remaining 32 are 4-byte encoded)
 * 	The "low 16 bits are FFFE or FFFF" test is done by masking off bit 0.
 */
#define UTF8_NONCHAR_CODE_3BYTE(codepoint)								\
	((0xFDD0 <= (unsigned)(codepoint))								\
		&& ((0xFDEF >= (unsigned)(codepoint)) || (0xFFFE == (0xFFFE & (unsigned)(codepoint)))))
#define UTF8_NONCHAR_CODE_4BYTE(codepoint)								\
	(0xFFFE == (0xFFFE & (unsigned)(codepoint)))

/* boolean_t UTF8_NONCHAR_CODEPOINT(wint_t c)
 * Returns non-zero if the code point (c) of ANY character is a noncharacter, 0 otherwise.
 * Code points up to and including UTF8_3BYTE_MAX (this covers the 1-byte and 2-byte ranges too)
 * are classified by UTF8_NONCHAR_CODE_3BYTE; the U+nFFFE/U+nFFFF pattern is checked for every
 * code point by UTF8_NONCHAR_CODE_4BYTE. The grouping of the two tests is spelled out explicitly.
 */
#define	UTF8_NONCHAR_CODEPOINT(codepoint)							\
	(UTF8_NONCHAR_CODE_4BYTE(codepoint)							\
		|| (((unsigned)(codepoint) <= UTF8_3BYTE_MAX) && UTF8_NONCHAR_CODE_3BYTE(codepoint)))

/* boolean_t UTF8_NONCHAR_3BYTE(uchar_ptr_t mbptr)
 * boolean_t UTF8_NONCHAR_4BYTE(uchar_ptr_t mbptr)
 * 	Noncharacter tests performed directly on the UTF-8 byte stream at mbptr instead of on
 * 	the code point. Each yields non-zero for a noncharacter and 0 otherwise.
 * 	Byte patterns recognized:
 * 	U+FDD0 - U+FDEF:
 * 		0xEF 0xB7 0x90  - 0xEF 0xB7 0xAF			(32 code points)
 * 	U+nFFFE and U+nFFFF:
 * 		0xEF 0xBF 0xBE and 0xEF 0xBF 0xBF			(U+FFFE, U+FFFF)
 * 		0xF0 0x9F 0xBF 0xBE and 0xF0 0x9F 0xBF 0xBF		(U+1FFFE, U+1FFFF)
 * 		0xF0 0xAF 0xBF 0xBE and 0xF0 0xAF 0xBF 0xBF		(U+2FFFE, U+2FFFF)
 * 		....
 * 		0xF4 0x8F 0xBF 0xBE and 0xF4 0x8F 0xBF 0xBF		(U+10FFFE, U+10FFFF)
 * 	Notes:
 * 	- mbptr must point to unsigned bytes: the bytes are compared against values >= 0x80.
 * 	- The last byte is only tested with "& 0xBE", so these macros rely on the caller having
 * 	  already verified that it is a trailing byte (0x80 - 0xBF).
 * 	- UTF8_NONCHAR_4BYTE does not look at the leading byte.
 * 	- mbptr may be any pointer expression; it is parenthesized in the expansion.
 */
#define UTF8_NONCHAR_3BYTE(mbptr)								\
	(((mbptr)[0] == 0xEF)									\
		&& ((((mbptr)[1] == 0xB7) && ((unsigned)((mbptr)[2] - 0x90) < 32))		\
			|| (((mbptr)[1] == 0xBF) && (((mbptr)[2] & 0xBE) == 0xBE))))
#define UTF8_NONCHAR_4BYTE(mbptr)								\
	((((mbptr)[1] & 0x0F) == 0x0F) && ((mbptr)[2] == 0xBF) && (((mbptr)[3] & 0xBE) == 0xBE))

/* Returns TRUE if mbptr points to a valid 3-byte UTF-8 encoding; otherwise returns FALSE and
 * resets bytelen to 1. Beyond requiring two trailing bytes, the sequence is rejected if it is
 *	- below the first 3-byte encoding E0 A0 80 (an overlong form),
 *	- a surrogate (ED A0 80 - ED BF BF), or
 *	- a noncharacter (see UTF8_NONCHAR_3BYTE).
 * Assumes "mbptr" is of type "uchar_ptr_t" and that its first byte is a 3-byte leading byte.
 * Both arguments may be arbitrary expressions (they are parenthesized in the expansion);
 * "bytelen" must be an lvalue.
 */
#define	UTF8_VALID_3BYTE(mbptr, bytelen)											\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])						\
			&& (((mbptr)[0] != 0xE0) || ((mbptr)[1] >= 0xA0))	/* at or above 3-byte UTF-8 BEGIN */		\
			&& (((mbptr)[0] != 0xED) || ((mbptr)[1] <= 0x9F))	/* NOT a 3-byte UTF-8 surrogate */		\
			&& !UTF8_NONCHAR_3BYTE((mbptr)))			/* NOT a 3-byte noncharacter */			\
		? TRUE														\
		: ((bytelen) = 1, FALSE))

/* Returns TRUE if mbptr points to a valid 4-byte UTF-8 encoding; otherwise returns FALSE and
 * resets bytelen to 1. Beyond requiring three trailing bytes, the sequence is rejected if it is
 *	- below the first 4-byte encoding F0 90 80 80 (an overlong form),
 *	- above the last 4-byte encoding F4 8F BF BF (beyond U+10FFFF), or
 *	- a noncharacter (see UTF8_NONCHAR_4BYTE).
 * Assumes "mbptr" is of type "uchar_ptr_t" and that its first byte is a 4-byte leading byte.
 * Both arguments may be arbitrary expressions (they are parenthesized in the expansion);
 * "bytelen" must be an lvalue.
 */
#define	UTF8_VALID_4BYTE(mbptr, bytelen)											\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])						\
			&& UTF8_IS_VALID_TRAILING((mbptr)[3])									\
			&& (((mbptr)[0] != 0xF0) || ((mbptr)[1] >= 0x90))	/* at or above 4-byte UTF-8 BEGIN */		\
			&& (((mbptr)[0] != 0xF4) || ((mbptr)[1] <= 0x8F))	/* at or below 4-byte UTF-8 END */		\
			&& !UTF8_NONCHAR_4BYTE((mbptr)))			/* NOT a 4-byte noncharacter */			\
		? TRUE														\
		: ((bytelen) = 1, FALSE))

/* If mbptr points to a valid 2-byte UTF-8 encoding, sets codepoint to the decoded code point and
 * returns (mbptr + 2). Otherwise sets codepoint to WEOF and returns (mbptr + 1).
 * Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]"
 * (the payload bits of the leading byte are taken from codepoint, not re-read from mbptr).
 * Both arguments may be arbitrary expressions (they are parenthesized in the expansion);
 * "codepoint" must be an lvalue.
 */
#define	UTF8_MBTOWC_2BYTE(mbptr, codepoint)											\
	(UTF8_IS_VALID_TRAILING((mbptr)[1])											\
		? ((codepoint) = ((((codepoint) & UTF8_LEAD_2BYTEMASK) << UTF8_NONLEAD_BITLEN)					\
					| ((mbptr)[1] & UTF8_NONLEAD_BYTEMASK)),						\
			((mbptr) + 2))												\
		: ((codepoint) = (wint_t)WEOF, ((mbptr) + 1)))

/* If mbptr points to a valid 3-byte UTF-8 encoding, sets codepoint to the decoded code point and
 * returns (mbptr + 3). Otherwise sets codepoint to WEOF and returns (mbptr + 1).
 * Overlong forms (below E0 A0 80), surrogates (ED A0 80 - ED BF BF) and noncharacters are
 * treated as invalid.
 * Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]"
 * (the payload bits of the leading byte are taken from codepoint, not re-read from mbptr).
 * Both arguments may be arbitrary expressions (they are parenthesized in the expansion);
 * "codepoint" must be an lvalue.
 */
#define	UTF8_MBTOWC_3BYTE(mbptr, codepoint)											\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])						\
			&& (((mbptr)[0] != 0xE0) || ((mbptr)[1] >= 0xA0))	/* at or above 3-byte UTF-8 BEGIN */		\
			&& (((mbptr)[0] != 0xED) || ((mbptr)[1] <= 0x9F))	/* NOT a 3-byte UTF-8 surrogate */		\
			&& !UTF8_NONCHAR_3BYTE((mbptr)))			/* NOT a 3-byte noncharacter */			\
		? ((codepoint) = ((((((codepoint) & UTF8_LEAD_3BYTEMASK) << UTF8_NONLEAD_BITLEN)				\
					| ((mbptr)[1] & UTF8_NONLEAD_BYTEMASK)) << UTF8_NONLEAD_BITLEN)				\
					| ((mbptr)[2] & UTF8_NONLEAD_BYTEMASK)),						\
			((mbptr) + 3))												\
		: ((codepoint) = (wint_t)WEOF, ((mbptr) + 1)))

/* If mbptr points to a valid 4-byte UTF-8 encoding, sets codepoint to the decoded code point and
 * returns (mbptr + 4). Otherwise sets codepoint to WEOF and returns (mbptr + 1).
 * Overlong forms (below F0 90 80 80), sequences above F4 8F BF BF and noncharacters are
 * treated as invalid.
 * Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]"
 * (the payload bits of the leading byte are taken from codepoint, not re-read from mbptr).
 * Both arguments may be arbitrary expressions (they are parenthesized in the expansion);
 * "codepoint" must be an lvalue.
 */
#define	UTF8_MBTOWC_4BYTE(mbptr, codepoint)											\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])						\
			&& UTF8_IS_VALID_TRAILING((mbptr)[3])									\
			&& (((mbptr)[0] != 0xF0) || ((mbptr)[1] >= 0x90))	/* at or above 4-byte UTF-8 BEGIN */		\
			&& (((mbptr)[0] != 0xF4) || ((mbptr)[1] <= 0x8F))	/* at or below 4-byte UTF-8 END */		\
			&& !UTF8_NONCHAR_4BYTE((mbptr)))			/* NOT a 4-byte noncharacter */			\
		? ((codepoint) = ((((((((codepoint) & UTF8_LEAD_4BYTEMASK) << UTF8_NONLEAD_BITLEN)				\
					| ((mbptr)[1] & UTF8_NONLEAD_BYTEMASK)) << UTF8_NONLEAD_BITLEN)				\
					| ((mbptr)[2] & UTF8_NONLEAD_BYTEMASK)) << UTF8_NONLEAD_BITLEN)				\
					| ((mbptr)[3] & UTF8_NONLEAD_BYTEMASK)),						\
			((mbptr) + 4))												\
		: ((codepoint) = (wint_t)WEOF, ((mbptr) + 1)))

/* Returns (mbptr + 2) if the byte after the leading byte at mbptr is a valid trailing byte
 * (i.e. mbptr points to a valid 2-byte UTF-8 encoding), and (mbptr + 1) otherwise.
 * Assumes "mbptr" is of type "uchar_ptr_t"; it may be an arbitrary pointer expression
 * (it is parenthesized in the expansion).
 */
#define	UTF8_MBNEXT_2BYTE(mbptr)							\
	(UTF8_IS_VALID_TRAILING((mbptr)[1])						\
		? ((mbptr) + 2)								\
		: ((mbptr) + 1))

/* Returns (mbptr + 3) if mbptr points to a valid 3-byte UTF-8 encoding, and (mbptr + 1) otherwise.
 * Overlong forms (below E0 A0 80), surrogates (ED A0 80 - ED BF BF) and noncharacters are
 * treated as invalid.
 * Assumes "mbptr" is of type "uchar_ptr_t"; it may be an arbitrary pointer expression
 * (it is parenthesized in the expansion).
 */
#define	UTF8_MBNEXT_3BYTE(mbptr)												\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])						\
			&& (((mbptr)[0] != 0xE0) || ((mbptr)[1] >= 0xA0))	/* at or above 3-byte UTF-8 BEGIN */		\
			&& (((mbptr)[0] != 0xED) || ((mbptr)[1] <= 0x9F))	/* NOT a 3-byte UTF-8 surrogate */		\
			&& !UTF8_NONCHAR_3BYTE((mbptr)))			/* NOT a 3-byte noncharacter */			\
		? ((mbptr) + 3)													\
		: ((mbptr) + 1))

/* Returns (mbptr + 4) if mbptr points to a valid 4-byte UTF-8 encoding, and (mbptr + 1) otherwise.
 * Overlong forms (below F0 90 80 80), sequences above F4 8F BF BF and noncharacters are
 * treated as invalid.
 * Assumes "mbptr" is of type "uchar_ptr_t"; it may be an arbitrary pointer expression
 * (it is parenthesized in the expansion).
 */
#define	UTF8_MBNEXT_4BYTE(mbptr)												\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])						\
			&& UTF8_IS_VALID_TRAILING((mbptr)[3])									\
			&& (((mbptr)[0] != 0xF0) || ((mbptr)[1] >= 0x90))	/* at or above 4-byte UTF-8 BEGIN */		\
			&& (((mbptr)[0] != 0xF4) || ((mbptr)[1] <= 0x8F))	/* at or below 4-byte UTF-8 END */		\
			&& !UTF8_NONCHAR_4BYTE((mbptr)))			/* NOT a 4-byte noncharacter */			\
		? ((mbptr) + 4)													\
		: ((mbptr) + 1))

/* Table indexed by the first byte of a UTF-8 sequence. As used by the macros in this file, an
 * entry of 1 for a non-ASCII byte means "invalid leading byte", and entries of 2, 3 or 4 select
 * the 2-, 3- or 4-byte handling. The table itself is defined elsewhere.
 */
LITREF unsigned int	utf8_bytelen[];

/* boolean_t UTF8_VALID(char *mbptr, char *ptrend, unsigned int bytelen)
 *	Examines the bytes of the UTF-8 string in [mbptr, ptrend) and returns TRUE if the byte
 *	sequence starting at mbptr forms a well-formed and complete UTF-8 character, FALSE
 *	otherwise. "bytelen" (an lvalue) is set to the byte length of the character when TRUE is
 *	returned and to 1 when FALSE is returned. A well-formed encoding of a surrogate
 *	(D800 - DFFF) or of a noncharacter is considered invalid.
 *	"ptrend" is assumed to be at least "mbptr + 1"; this is not checked.
 *	The checks are made in this order: ASCII byte, invalid leading byte, not enough input,
 *	then the length-specific validation.
 */
#define	UTF8_VALID(mbptr, ptrend, bytelen)										\
	((bytelen) = utf8_bytelen[*(uchar_ptr_t)(mbptr)],								\
	 (((*(uchar_ptr_t)(mbptr)) <= ASCII_MAX)				/* ASCII: cheapest check first */	\
		? TRUE													\
	 : ((1 == (bytelen))							/* invalid leading byte */		\
		? FALSE													\
	 : (((int4)((uchar_ptr_t)(ptrend) - (uchar_ptr_t)(mbptr)) < (int4)(bytelen))				\
		? ((bytelen) = 1, FALSE)					/* input ends inside the character */	\
	 : ((2 == (bytelen))												\
		? UTF8_VALID_2BYTE(((uchar_ptr_t)(mbptr)), (bytelen))							\
	 : ((3 == (bytelen))												\
		? UTF8_VALID_3BYTE(((uchar_ptr_t)(mbptr)), (bytelen))							\
		: UTF8_VALID_4BYTE(((uchar_ptr_t)(mbptr)), (bytelen))))))))

/* boolean_t U_VALID_CODE(wint_t codepoint)
 * 	Returns non-zero if codepoint is a valid Unicode code point and 0 otherwise.
 * 	A code point is invalid if any one of the following holds:
 * 		it is greater than U+10FFFF (UTF8_4BYTE_MAX)
 * 		it is a surrogate code point
 * 		it is a noncharacter code point
 */
#define U_VALID_CODE(codepoint)												\
	(!(((unsigned)(codepoint) > UTF8_4BYTE_MAX)									\
		|| U_IS_SURROGATE_CODE(codepoint)									\
		|| UTF8_NONCHAR_CODEPOINT(codepoint)))

/* Table indexed by a byte value. As used by the macros in this file, an entry of -1 marks a byte
 * that cannot start a UTF-8 character; any other entry is the number of bytes that follow the
 * leading byte. The table itself is defined elsewhere.
 */
LITREF signed int 	utf8_followlen[];

/* int UTF8_MBFOLLOW(char *mbptr)
 *	Looks only at the first byte of the (possibly incomplete) UTF-8 string at mbptr and
 *	returns the number of bytes that must follow it to form a complete character:
 *	0, 1, 2 or 3. Returns -1 if that byte is not a legal first byte of a UTF-8 character.
 *	The answer is a plain lookup in utf8_followlen[].
 */
#define	UTF8_MBFOLLOW(mbptr)	(utf8_followlen[*(uchar_ptr_t)(mbptr)])

/* int UTF16BE_MBFOLLOW(char *mbptr, char *ptrend)
 *	Looks at up to two bytes of the (possibly incomplete) UTF-16BE string at mbptr and
 *	returns the number of bytes that must follow the byte at mbptr to form a complete
 *	UTF-16 character in BIG-ENDIAN format: 3 if a high surrogate starts at mbptr, else 1.
 *	Returns -1 if fewer than 2 bytes are available in [mbptr, ptrend).
 *	Both arguments are parenthesized in the expansion so that the length check
 *	"ptrend - mbptr" is computed correctly for expressions such as "p + n".
 */
#define	UTF16BE_MBFOLLOW(mbptr, ptrend)								\
	((((ptrend) - (mbptr)) >= 2) ? (UTF16BE_HIGH_SURROGATE((mbptr)) ? 3 : 1) : -1)


/* int UTF16LE_MBFOLLOW(char *mbptr, char *ptrend)
 *	Looks at up to two bytes of the (possibly incomplete) UTF-16LE string at mbptr and
 *	returns the number of bytes that must follow the byte at mbptr to form a complete
 *	UTF-16 character in LITTLE-ENDIAN format: 3 if a high surrogate starts at mbptr, else 1.
 *	Returns -1 if fewer than 2 bytes are available in [mbptr, ptrend).
 *	Both arguments are parenthesized in the expansion so that the length check
 *	"ptrend - mbptr" is computed correctly for expressions such as "p + n".
 */
#define	UTF16LE_MBFOLLOW(mbptr, ptrend)								\
	((((ptrend) - (mbptr)) >= 2) ? (UTF16LE_HIGH_SURROGATE((mbptr)) ? 3 : 1) : -1)

/* boolean_t UTF16BE_VALID(char *mbptr, char *ptrend, unsigned int bytelen)
 *	Examines 2 or 4 bytes of the UTF-16BE string in [mbptr, ptrend) and returns TRUE if the
 *	byte sequence starting at mbptr forms a well-formed and complete UTF-16 character in
 *	big-endian format, FALSE otherwise. Also sets "bytelen" (an lvalue) to 2 (for a BMP
 *	character, or a high surrogate without a usable low surrogate) or 4 (for a surrogate pair).
 *	NOTES:
 *		"bytelen" is always set irrespective of the validity of the code point (e.g. it
 *		is set to 4 for a surrogate pair for which the macro returns FALSE because its
 *		code point is a noncharacter).
 *
 *		"bytelen" doubles as scratch storage for the decoded code point, so it must be
 *		wide enough to hold values up to U+10FFFF.
 *
 *		"ptrend" is assumed to be at least "mbptr + 2".
 *
 *		The pointer arguments are parenthesized in the expansion so that the 4-byte
 *		length check is computed correctly for expressions such as "p + n".
 */
#define	UTF16BE_VALID(mbptr, ptrend, bytelen)							\
	(UTF16BE_HIGH_SURROGATE((mbptr))	/* compute the code point first */		\
		? (((((ptrend) - (mbptr)) >= 4) && UTF16BE_LOW_SURROGATE(((mbptr) + 2)))	\
			? ((UTF16BE_LOAD_SURROGATE((mbptr), bytelen), U_VALID_CODE(bytelen))	\
				? ((bytelen) = 4, TRUE) : ((bytelen) = 4, FALSE))		\
			: ((bytelen) = 2, FALSE))						\
		: ((((bytelen) = UTF16BE_GET_UNIT((mbptr))), U_VALID_CODE(bytelen))		\
			? ((bytelen) = 2, TRUE) : ((bytelen) = 2, FALSE)))

/* boolean_t UTF16LE_VALID(char *mbptr, char *ptrend, unsigned int bytelen)
 *	Examines 2 or 4 bytes of the UTF-16LE string in [mbptr, ptrend) and returns TRUE if the
 *	byte sequence starting at mbptr forms a well-formed and complete UTF-16 character in
 *	little-endian format, FALSE otherwise. Also sets "bytelen" (an lvalue) to 2 (for a BMP
 *	character, or a high surrogate without a usable low surrogate) or 4 (for a surrogate pair).
 *	NOTES:
 *		"bytelen" is always set irrespective of the validity of the code point (e.g. it
 *		is set to 4 for a surrogate pair for which the macro returns FALSE because its
 *		code point is a noncharacter).
 *
 *		"bytelen" doubles as scratch storage for the decoded code point, so it must be
 *		wide enough to hold values up to U+10FFFF.
 *
 *		"ptrend" is assumed to be at least "mbptr + 2".
 *
 *		The pointer arguments are parenthesized in the expansion so that the 4-byte
 *		length check is computed correctly for expressions such as "p + n".
 */
#define	UTF16LE_VALID(mbptr, ptrend, bytelen)							\
	(UTF16LE_HIGH_SURROGATE((mbptr))	/* compute the code point first */		\
		? (((((ptrend) - (mbptr)) >= 4) && UTF16LE_LOW_SURROGATE(((mbptr) + 2)))	\
			? ((UTF16LE_LOAD_SURROGATE((mbptr), bytelen), U_VALID_CODE(bytelen))	\
				? ((bytelen) = 4, TRUE) : ((bytelen) = 4, FALSE))		\
			: ((bytelen) = 2, FALSE))						\
		: ((((bytelen) = UTF16LE_GET_UNIT((mbptr))), U_VALID_CODE(bytelen))		\
			? ((bytelen) = 2, TRUE) : ((bytelen) = 2, FALSE)))

/* unsigned char *UTF8_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint)
 *	Examines the bytes of the UTF-8 string in [mbptr, ptrend) and sets "codepoint" (an lvalue)
 *	to the code point of the character starting at mbptr. If those bytes do not form a
 *	complete well-formed UTF-8 character, "codepoint" is set to WEOF.
 *	Returns (mbptr + len) as a uchar_ptr_t, where "len" is the byte length of the character
 *	found; when "codepoint" is set to WEOF the return value is (mbptr + 1).
 *	"codepoint" is first loaded with the leading byte and is used to index utf8_bytelen[]
 *	until the length-specific helper overwrites it.
 *	"mbptr" is parenthesized everywhere in the expansion, so any pointer expression may be
 *	passed.
 */
#define	UTF8_MBTOWC(mbptr, ptrend, codepoint)										\
	((codepoint) = (wint_t)(((uchar_ptr_t)(mbptr))[0]),								\
		(((codepoint) <= ASCII_MAX)				/* ASCII. Do simplest check first. */		\
			? ((uchar_ptr_t)(mbptr) + 1)									\
		: ((utf8_bytelen[(codepoint)] == 1)			/* Invalid leading byte */			\
			? ((codepoint) = (wint_t)WEOF, ((uchar_ptr_t)(mbptr) + 1))					\
		: (((int4)utf8_bytelen[(codepoint)]			/* Not enough length in input string */		\
				> (int4)(((uchar_ptr_t)(ptrend)) - ((uchar_ptr_t)(mbptr))))				\
			? ((codepoint) = (wint_t)WEOF, ((uchar_ptr_t)(mbptr) + 1))					\
		: ((utf8_bytelen[(codepoint)] == 2)									\
			? UTF8_MBTOWC_2BYTE(((uchar_ptr_t)(mbptr)), (codepoint))					\
		: ((utf8_bytelen[(codepoint)] == 3)									\
			? UTF8_MBTOWC_3BYTE(((uchar_ptr_t)(mbptr)), (codepoint))					\
			: UTF8_MBTOWC_4BYTE(((uchar_ptr_t)(mbptr)), (codepoint))))))))

/* unsigned char* UTF8_MBNEXT(char *mbptr, char *ptrend)
 *	Examines the bytes in [mbptr, ptrend) and returns, as a uchar_ptr_t, the pointer to the
 *	beginning of the character that follows the one at mbptr, i.e. mbptr advanced by the
 *	byte length of the character it points to. If the bytes starting at mbptr do not form
 *	a well-formed character within the limit given by ptrend, the pointer to the next byte
 *	(mbptr + 1) is returned.
 *	Both arguments are parenthesized everywhere in the expansion; in particular the
 *	remaining-length check "ptrend - mbptr" is computed correctly for expressions such as
 *	"p + n", so a character truncated by ptrend is never stepped over.
 */
#define	UTF8_MBNEXT(mbptr, ptrend)												\
	((((uchar_ptr_t)(mbptr))[0] <= ASCII_MAX)			/* ASCII. Do simplest check first. */			\
		? ((uchar_ptr_t)(mbptr) + 1)											\
	: ((utf8_bytelen[((uchar_ptr_t)(mbptr))[0]] == 1)		/* Invalid leading byte */				\
		? ((uchar_ptr_t)(mbptr) + 1)											\
	: (((int4)utf8_bytelen[((uchar_ptr_t)(mbptr))[0]]		/* Not enough length in input string */			\
			> (int4)(((uchar_ptr_t)(ptrend)) - ((uchar_ptr_t)(mbptr))))						\
		? ((uchar_ptr_t)(mbptr) + 1)											\
	: ((utf8_bytelen[((uchar_ptr_t)(mbptr))[0]] == 2)									\
		? UTF8_MBNEXT_2BYTE(((uchar_ptr_t)(mbptr)))									\
	: ((utf8_bytelen[((uchar_ptr_t)(mbptr))[0]] == 3)									\
		? UTF8_MBNEXT_3BYTE(((uchar_ptr_t)(mbptr)))									\
		: UTF8_MBNEXT_4BYTE(((uchar_ptr_t)(mbptr))))))))


/* unsigned char* UTF8_WCTOMB(wint_t codepoint, char *mbptr)
 *	Converts the code point of a character to its UTF-8 byte sequence (1 to 4 bytes) and
 *	stores it at the beginning of the byte array pointed to by mbptr. Returns, as a
 *	uchar_ptr_t, mbptr advanced by the number of bytes stored.
 *	For invalid code points (surrogates, noncharacters, anything above U+10FFFF) nothing is
 *	stored and mbptr is returned unchanged.
 *	Within each encoding the trailing bytes are stored before the leading byte.
 *	"mbptr" is parenthesized everywhere in the expansion, so any pointer expression may be
 *	passed; both arguments are evaluated more than once and so must be free of side effects.
 */
#define	UTF8_WCTOMB(codepoint, mbptr)												\
	(((unsigned)(codepoint) <= UTF8_1BYTE_MAX)			/* 1-byte UTF-8 encoding */				\
		? (((uchar_ptr_t)(mbptr))[0] = (unsigned char)(codepoint),							\
			((uchar_ptr_t)(mbptr)) + 1)										\
	: (((unsigned)(codepoint) <= UTF8_2BYTE_MAX)			/* 2-byte UTF-8 encoding */				\
		? (((uchar_ptr_t)(mbptr))[1]											\
				= (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK) | UTF8_NONLEAD_BYTE_PREFIX),		\
			((uchar_ptr_t)(mbptr))[0]										\
				= (unsigned char)(((codepoint) >> UTF8_NONLEAD_BITLEN) | UTF8_LEAD_2BYTE_PREFIX),		\
			((uchar_ptr_t)(mbptr)) + 2)										\
	: (((unsigned)(codepoint) <= UTF8_3BYTE_MAX)			/* 3-byte UTF-8 encoding */				\
		? ((U_IS_SURROGATE_CODE(codepoint) || UTF8_NONCHAR_CODE_3BYTE(codepoint))					\
			? ((uchar_ptr_t)(mbptr))		/* Surrogate or noncharacter (3-byte case) */			\
			: (((uchar_ptr_t)(mbptr))[2]		/* Valid 3-byte case */						\
					= (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK)					\
								| UTF8_NONLEAD_BYTE_PREFIX),					\
				((uchar_ptr_t)(mbptr))[1]									\
					= (unsigned char)((((codepoint) >> UTF8_NONLEAD_BITLEN) & UTF8_NONLEAD_BYTEMASK)	\
								| UTF8_NONLEAD_BYTE_PREFIX),					\
				((uchar_ptr_t)(mbptr))[0]									\
					= (unsigned char)(((codepoint) >> (2 * UTF8_NONLEAD_BITLEN))				\
								| UTF8_LEAD_3BYTE_PREFIX),					\
				((uchar_ptr_t)(mbptr)) + 3))									\
	: ((((unsigned)(codepoint) <= UTF8_4BYTE_MAX) && !UTF8_NONCHAR_CODE_4BYTE(codepoint))	/* 4-byte UTF-8 encoding */	\
		? (((uchar_ptr_t)(mbptr))[3]											\
				= (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK) | UTF8_NONLEAD_BYTE_PREFIX),		\
			((uchar_ptr_t)(mbptr))[2]										\
				= (unsigned char)((((codepoint) >> UTF8_NONLEAD_BITLEN) & UTF8_NONLEAD_BYTEMASK)		\
							| UTF8_NONLEAD_BYTE_PREFIX),						\
			((uchar_ptr_t)(mbptr))[1]										\
				= (unsigned char)((((codepoint) >> (2 * UTF8_NONLEAD_BITLEN)) & UTF8_NONLEAD_BYTEMASK)		\
							| UTF8_NONLEAD_BYTE_PREFIX),						\
			((uchar_ptr_t)(mbptr))[0]										\
				= (unsigned char)(((codepoint) >> (3 * UTF8_NONLEAD_BITLEN)) | UTF8_LEAD_4BYTE_PREFIX),		\
			((uchar_ptr_t)(mbptr)) + 4)										\
		: ((uchar_ptr_t)(mbptr))))))

/* boolean_t UTF8_SURROGATE(char *mbptr, char *ptrend)
 *	Examines the bytes of the UTF-8 string in [mbptr, ptrend) and returns TRUE if the byte
 *	sequence starting at mbptr is the (well-formed but invalid) 3-byte UTF-8 form of a
 *	surrogate code point, high or low: ED A0 80 - ED BF BF. Returns FALSE otherwise,
 *	including when fewer than UTF8_SURROGATE_BYTELEN bytes are available.
 *	Both arguments are parenthesized in the expansion so that the length check
 *	"ptrend - mbptr" is computed correctly for expressions such as "p + n".
 */
#define	UTF8_SURROGATE(mbptr, ptrend)											\
	(((UTF8_SURROGATE_BYTELEN			/* need at least 3 bytes of input */				\
			<= (int4)(((uchar_ptr_t)(ptrend)) - ((uchar_ptr_t)(mbptr))))					\
		&& (((uchar_ptr_t)(mbptr))[0] == 0xED)	/* leading byte must be 0xED for a surrogate */			\
		&& (((uchar_ptr_t)(mbptr))[1] >= 0xA0)	/* first non-leading byte must be at least 0xA0 */		\
		&& (((uchar_ptr_t)(mbptr))[1] <= 0xBF)	/* first non-leading byte must be at most  0xBF */		\
		&& UTF8_IS_VALID_TRAILING(((uchar_ptr_t)(mbptr))[2]))	/* second non-leading byte must be valid */	\
	? TRUE : FALSE)

/* void UTF8_LEADING_BYTE(char *mbptr, char *baseptr, char *leadptr)
 * 	Sets leadptr to the leading byte of the UTF-8 character that contains the byte at mbptr.
 * 	The search steps backwards over trailing bytes (10xxxxxx) and never accepts a position
 * 	before baseptr. leadptr is set to mbptr itself if
 * 		- the search runs past baseptr,
 * 		- the byte it stops on is not a valid leading byte, or
 * 		- mbptr lies further from that byte than the number of bytes it can be followed by.
 * 	NOTE: mbptr and leadptr must not be the same variable.
 * 	NOTE: this expands to a brace-enclosed block, not an expression.
 */
#define UTF8_LEADING_BYTE(mbptr, baseptr, leadptr)						\
{												\
	for (leadptr = mbptr;									\
			(leadptr >= baseptr) && UTF8_IS_VALID_TRAILING(*(uchar_ptr_t)leadptr);	\
			--leadptr)								\
		;										\
	if ((leadptr < baseptr)									\
			|| !UTF8_IS_VALID_LEADING(*(uchar_ptr_t)leadptr)			\
			|| ((mbptr - leadptr) > utf8_followlen[*(uchar_ptr_t)leadptr]))		\
		leadptr = mbptr;								\
}

/* UTF-16 (16-bit) code units of a code point in the supplementary planes.
 *	UTF16_HIGH_SURROGATE : 110110wwwwxxxxxx, where wwww = (plane number - 1) and the x-s are
 *				bits 15..10 of the code point
 *	UTF16_LOW_SURROGATE  : 110111xxxxxxxxxx, where the x-s are bits 9..0 of the code point
 * Note: these macros must be called only for supplementary code points, i.e. those that are
 * > U_BMP_MAX and <= UTF8_4BYTE_MAX.
 */
#define UTF16_HIGH_SURROGATE(codepoint)									\
	((((codepoint) >> 10) & 0x3F) | ((((codepoint) >> 16) - 1) << 6) | U_HIGH_SURROGATE_BEGIN)
#define UTF16_LOW_SURROGATE(codepoint) 									\
	(((codepoint) & 0x3FF) | U_LOW_SURROGATE_BEGIN)

/* Combines a high and a low surrogate code unit into the supplementary-plane code point they
 * encode: the plane number is (bits 9..6 of high) + 1, bits 15..10 come from bits 5..0 of high
 * and bits 9..0 come from low. The surrogate marker bits of both units are ignored.
 */
#define UTF16_COMPOSE_SURROGATES(high, low)								\
	(((low) & 0x3FF) | (((high) & 0x3F) << 10) | (((((high) >> 6) & 0xF) + 1) << 16))

/* Store one UTF-16 (16-bit) code unit as a 2-byte sequence at mbptr in big-endian (BE) or
 * little-endian (LE) byte order.
 * The code unit passed must be less than or equal to U_BMP_MAX.
 * "mbptr" is parenthesized before being cast to a byte pointer, so any pointer expression
 * (e.g. "p + 2") addresses the bytes the caller means.
 */
#define UTF16BE_STORE_UNIT(mbptr, codeunit)								\
	((((uchar_ptr_t)(mbptr))[0] = (unsigned char)((codeunit) >> 8)),				\
		(((uchar_ptr_t)(mbptr))[1] = (unsigned char)((codeunit) & 0x00FF)))
#define UTF16LE_STORE_UNIT(mbptr, codeunit)								\
	((((uchar_ptr_t)(mbptr))[1] = (unsigned char)((codeunit) >> 8)),				\
		(((uchar_ptr_t)(mbptr))[0] = (unsigned char)((codeunit) & 0x00FF)))

/* Return the single UTF-16 (16-bit) code unit held in the 2-byte sequence at mbptr, read in
 * big-endian (BE) or little-endian (LE) byte order.
 * "mbptr" is parenthesized before being cast to a byte pointer, so any pointer expression
 * (e.g. "p + 2") addresses the bytes the caller means.
 */
#define UTF16BE_GET_UNIT(mbptr)										\
	((((uchar_ptr_t)(mbptr))[0] << 8) | ((uchar_ptr_t)(mbptr))[1])
#define UTF16LE_GET_UNIT(mbptr)										\
	((((uchar_ptr_t)(mbptr))[1] << 8) | ((uchar_ptr_t)(mbptr))[0])

/* Load a UTF-16 surrogate pair (big-endian or little-endian) from the 4 bytes at mbptr and
 * store the supplementary-plane code point it encodes in "codepoint" (an lvalue). The value
 * of the macro is that code point. "codepoint" briefly holds the high surrogate code unit
 * before it is overwritten with the composed value.
 * Note: mbptr must point to a valid 4-byte sequence of high and low surrogates; no check is
 * made here. The low surrogate is read at a byte offset of 2 from mbptr.
 * Both arguments are parenthesized in the expansion.
 */
#define UTF16BE_LOAD_SURROGATE(mbptr, codepoint)							\
	((codepoint) = UTF16BE_GET_UNIT((mbptr)),							\
		(codepoint) = UTF16_COMPOSE_SURROGATES((codepoint),					\
					UTF16BE_GET_UNIT(((uchar_ptr_t)(mbptr) + 2))))
#define UTF16LE_LOAD_SURROGATE(mbptr, codepoint)							\
	((codepoint) = UTF16LE_GET_UNIT((mbptr)),							\
		(codepoint) = UTF16_COMPOSE_SURROGATES((codepoint),					\
					UTF16LE_GET_UNIT(((uchar_ptr_t)(mbptr) + 2))))

/* char* UTF16BE_WCTOMB(wint_t codepoint, char *mbptr)
 *	Converts the code point of a character to big-endian UTF-16 bytes and stores the result
 *	(2 bytes for a BMP code point, 4 bytes - a surrogate pair - otherwise) at the beginning
 *	of the byte array pointed to by mbptr. Returns mbptr advanced by the number of bytes
 *	stored. For invalid code points (see U_VALID_CODE) nothing is stored and mbptr is
 *	returned unchanged.
 *	"mbptr" is parenthesized everywhere in the expansion, so any byte-pointer expression
 *	may be passed; both arguments are evaluated more than once and so must be free of
 *	side effects.
 */
#define	UTF16BE_WCTOMB(codepoint, mbptr)									\
	(U_VALID_CODE(codepoint)										\
		? (((codepoint) <= U_BMP_MAX)									\
			? (UTF16BE_STORE_UNIT((mbptr), (codepoint)), ((mbptr) + 2))	/* code points in BMP */\
			: (UTF16BE_STORE_UNIT((mbptr), UTF16_HIGH_SURROGATE(codepoint)), /* supplementary plane */\
			   UTF16BE_STORE_UNIT(((uchar_ptr_t)(mbptr) + 2), UTF16_LOW_SURROGATE(codepoint)),	\
			   ((mbptr) + 4)))									\
		: (mbptr))

/* char* UTF16LE_WCTOMB(wint_t codepoint, char *mbptr)
 *	Converts the code point of a character to little-endian UTF-16 bytes and stores the
 *	result (2 bytes for a BMP code point, 4 bytes - a surrogate pair - otherwise) at the
 *	beginning of the byte array pointed to by mbptr. Returns mbptr advanced by the number
 *	of bytes stored. For invalid code points (see U_VALID_CODE) nothing is stored and mbptr
 *	is returned unchanged.
 *	"mbptr" is parenthesized everywhere in the expansion, so any byte-pointer expression
 *	may be passed; both arguments are evaluated more than once and so must be free of
 *	side effects.
 */
#define	UTF16LE_WCTOMB(codepoint, mbptr)									\
	(U_VALID_CODE(codepoint)										\
		? (((codepoint) <= U_BMP_MAX)									\
			? (UTF16LE_STORE_UNIT((mbptr), (codepoint)), ((mbptr) + 2))	/* code points in BMP */\
			: (UTF16LE_STORE_UNIT((mbptr), UTF16_HIGH_SURROGATE(codepoint)), /* supplementary plane */\
			   UTF16LE_STORE_UNIT(((uchar_ptr_t)(mbptr) + 2), UTF16_LOW_SURROGATE(codepoint)),	\
			   ((mbptr) + 4)))									\
		: (mbptr))

/* char *UTF16BE_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint)
 *	Inspects 2 bytes (or 4 bytes if surrogates) of the UTF-16 string in big-endian and
 *	sets "codepoint" to the code point of the next character in the string. Returns
 *	(mbptr + len) where "len" is the byte length of the UTF-16BE character found. If
 *	the bytes starting from mbptr do not form a complete well-formed UTF-16BE character,
 *	it sets codepoint to WEOF and returns mbptr. That covers:
 *		- fewer than 2 bytes left before ptrend (no byte is read in that case),
 *		- a high surrogate with fewer than 4 bytes left or not followed by a low surrogate,
 *		- any resulting value rejected by U_VALID_CODE.
 *	mbptr and ptrend must be byte pointers into the same buffer. All arguments are evaluated
 *	more than once, so they must be free of side effects.
 */
#define	UTF16BE_MBTOWC(mbptr, ptrend, codepoint)							\
	(((((ptrend) - (mbptr)) < 2)		/* not even one complete code unit left */		\
		? ((codepoint) = WEOF)									\
		: (UTF16BE_HIGH_SURROGATE(mbptr) /* compute the code point first */			\
			? (((((ptrend) - (mbptr)) >= 4) && UTF16BE_LOW_SURROGATE((mbptr) + 2))		\
				? UTF16BE_LOAD_SURROGATE(mbptr, codepoint) : ((codepoint) = WEOF))	\
			: ((codepoint) = UTF16BE_GET_UNIT(mbptr)))),					\
	(U_VALID_CODE(codepoint)		/* validate the code point */				\
		? (((codepoint) <= U_BMP_MAX) ? ((mbptr) + 2) : ((mbptr) + 4))				\
		: (((codepoint) = WEOF), (mbptr))))

/* char *UTF16LE_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint)
 *	Inspects 2 bytes (or 4 bytes if surrogates) of the UTF-16 string in little-endian and
 *	sets "codepoint" to the code point of the next character in the string. Returns
 *	(mbptr + len) where "len" is the byte length of the UTF-16LE character found. If
 *	the bytes starting from mbptr do not form a complete well-formed UTF-16LE character,
 *	it sets codepoint to WEOF and returns mbptr. That covers:
 *		- fewer than 2 bytes left before ptrend (no byte is read in that case),
 *		- a high surrogate with fewer than 4 bytes left or not followed by a low surrogate,
 *		- any resulting value rejected by U_VALID_CODE.
 *	mbptr and ptrend must be byte pointers into the same buffer. All arguments are evaluated
 *	more than once, so they must be free of side effects.
 */
#define	UTF16LE_MBTOWC(mbptr, ptrend, codepoint)							\
	(((((ptrend) - (mbptr)) < 2)		/* not even one complete code unit left */		\
		? ((codepoint) = WEOF)									\
		: (UTF16LE_HIGH_SURROGATE(mbptr) /* compute the code point first */			\
			? (((((ptrend) - (mbptr)) >= 4) && UTF16LE_LOW_SURROGATE((mbptr) + 2))		\
				? UTF16LE_LOAD_SURROGATE(mbptr, codepoint) : ((codepoint) = WEOF))	\
			: ((codepoint) = UTF16LE_GET_UNIT(mbptr)))),					\
	(U_VALID_CODE(codepoint)		/* validate the code point */				\
		? (((codepoint) <= U_BMP_MAX) ? ((mbptr) + 2) : ((mbptr) + 4))				\
		: (((codepoint) = WEOF), (mbptr))))

/* boolean_t UTF16BE_HIGH_SURROGATE(char* mbptr)
 *	Reads the 2-byte big-endian code unit at mbptr (via UTF16BE_GET_UNIT, so both bytes must be
 *	readable) and returns TRUE if that unit is a well-formed UTF-16 high surrogate
 *	(U+D800 - U+DBFF) and FALSE otherwise. The range test itself is done by U_IS_SURROGATE_HIGH.
 */
#define UTF16BE_HIGH_SURROGATE(mbptr)								\
	(U_IS_SURROGATE_HIGH(UTF16BE_GET_UNIT(mbptr)))

/* boolean_t UTF16LE_HIGH_SURROGATE(char* mbptr)
 *	Reads the 2-byte little-endian code unit at mbptr (via UTF16LE_GET_UNIT, so both bytes must be
 *	readable) and returns TRUE if that unit is a well-formed UTF-16 high surrogate
 *	(U+D800 - U+DBFF) and FALSE otherwise. The range test itself is done by U_IS_SURROGATE_HIGH.
 */
#define UTF16LE_HIGH_SURROGATE(mbptr)								\
	(U_IS_SURROGATE_HIGH(UTF16LE_GET_UNIT(mbptr)))

/* boolean_t UTF16BE_LOW_SURROGATE(char* mbptr)
 *	Reads the 2-byte big-endian code unit at mbptr (via UTF16BE_GET_UNIT, so both bytes must be
 *	readable) and returns TRUE if that unit is a well-formed UTF-16 low surrogate
 *	(U+DC00 - U+DFFF) and FALSE otherwise. The range test itself is done by U_IS_SURROGATE_LOW.
 */
#define UTF16BE_LOW_SURROGATE(mbptr)								\
	(U_IS_SURROGATE_LOW(UTF16BE_GET_UNIT(mbptr)))

/* boolean_t UTF16LE_LOW_SURROGATE(char* mbptr)
 *	Reads the 2-byte little-endian code unit at mbptr (via UTF16LE_GET_UNIT, so both bytes must be
 *	readable) and returns TRUE if that unit is a well-formed UTF-16 low surrogate
 *	(U+DC00 - U+DFFF) and FALSE otherwise. The range test itself is done by U_IS_SURROGATE_LOW.
 */
#define UTF16LE_LOW_SURROGATE(mbptr)								\
	(U_IS_SURROGATE_LOW(UTF16LE_GET_UNIT(mbptr)))

/* The following macros provide the character classification for Unicode characters given their code points.
 * Each macro forwards its argument unchanged to the lower-case routine of the same name (u_islower, u_isupper, ...),
 * with one exception: U_ISPRINT goes through GTM_U_ISPRINT instead of calling u_isprint directly.
 * U_CHARTYPE forwards to u_charType; CTYPEMASK feeds its result to U_MASK to test general-category bits.
 */
#define U_ISLOWER(c)	u_islower(c)
#define U_ISUPPER(c)	u_isupper(c)
#define U_ISALPHA(c)	u_isalpha(c)
#define U_ISCNTRL(c)	u_iscntrl(c)
#define U_ISDIGIT(c)	u_isdigit(c)
#define U_ISPUNCT(c)	u_ispunct(c)
#define U_ISSPACE(c)	u_isspace(c)
#define U_ISBLANK(c)	u_isblank(c)
#define U_ISGRAPH(c)	u_isgraph(c)
#define U_ISPRINT(c)	GTM_U_ISPRINT(c) /* see macro definition for why redirection needed */
#define U_ISTITLE(c)	u_istitle(c)
#define U_CHARTYPE(c)	u_charType(c)

/* uint4	CTYPEMASK(wint_t c)
 *
 * This macro assumes that "c" is a valid unicode codepoint.
 *
 * Returns a patcode from a code point (wide character wint_t) paralleling the way ICU library functions classify codepoints.
 * 	u_isalpha (for A)
 * 	u_isdigit (for N)
 * 	u_ispunct (for P)
 * 	u_iscntrl (for C)
 * But with the following adjustments.
 *	1) If $ZPATNUMERIC is not "UTF-8", non-ASCII decimal digits are classified as A.
 *	2) Non-decimal digits (Nl and No) are classified as A. Note: u_isdigit only matches decimal digits.
 *	3) Anything left is classified via u_isprint into either P or C. Note: u_isprint only matches non-control characters.
 * Note that the ISV $ZPATN[UMERIC] dictates how the pattern class N used in the pattern match operator is interpreted.
 * If $ZPATNUMERIC is "UTF-8", the pattern class N matches any decimal numeric character as defined by the Unicode standard.
 * If $ZPATNUMERIC is "M", GT.M restricts the pattern class N to match only ASCII digits 0-9 (i.e. ASCII 48-57).
 * The variable "utf8_patnumeric" is TRUE if $ZPATNUMERIC is "UTF-8".
 *
 * The above rules result in the following mapping
 *      --------------------------------------------------
 *      Unicode general category       GT.M patcode class
 *      --------------------------------------------------
 *	L* (all letters)	    -> A
 *	M* (all marks)		    -> P
 *	Nd (decimal numbers)	    -> N (if decimal digit is ASCII or $ZPATNUMERIC is "UTF-8", otherwise -> A)
 *	Nl (letter numbers)	    -> A (examples of Nl are Roman numerals)
 *	No (other numbers)	    -> A (examples of No are fractions)
 *	P* (all punctuation)	    -> P
 *	S* (all symbols)	    -> P
 *	Zs (spaces)		    -> P
 *	Zl (line separators)	    -> C
 *	Zp (paragraph separators)   -> C
 *	C* (all control codepoints) -> C
 *
 * For a description of the Unicode general categories see http://unicode.org/versions/Unicode4.0.0/ch04.pdf (section 4.5)
 *
 * E = A + P + N + C and the classifications A, P, N, and C are mutually exclusive.
 *
 * This means that PATM_UTF8_NONBASIC does not currently have any codepoints mapped to it.
 * It is being retained in case it is needed in the future.
 */

/* General-category bits for the non-decimal numbers (Nl and No); CTYPEMASK files these under alphabetic */
#define GTM_NA_MASK (U_GC_NL_MASK | U_GC_NO_MASK)

/* Classification ladder: the tests are tried strictly top to bottom and the first one that holds decides the patcode.
 * "c" is evaluated several times, so it must not have side effects.
 */
#define	CTYPEMASK(c)														\
	((U_ISALPHA(c))						/* letters: split by case */					\
		? ((U_ISLOWER(c))												\
			? PATM_L												\
			: ((U_ISUPPER(c)) ? PATM_U : PATM_UTF8_ALPHABET))	/* caseless letters land here */		\
	: (U_ISDIGIT(c))					/* decimal digits, ASCII or not */				\
		? ((utf8_patnumeric || IS_ASCII(c))										\
			? PATM_N				/* ASCII digit, or $ZPATNUMERIC is "UTF-8" */			\
			: PATM_UTF8_ALPHABET)			/* non-ASCII digit while $ZPATNUMERIC is "M" */			\
	: (U_MASK(U_CHARTYPE(c)) & GTM_NA_MASK)			/* letter-numbers and other-numbers */				\
		? PATM_UTF8_ALPHABET												\
	: (U_ISPUNCT(c))					/* punctuation */						\
		? PATM_P													\
	: (U_ISCNTRL(c))					/* control */							\
		? PATM_C													\
	: (U_ISPRINT(c))					/* leftovers: printable -> P, anything else -> C */		\
		? PATM_P													\
		: PATM_C)

/* uint4	TYPEMASK(char *ptr, char *ptrend, char *ptrnext, wint_t codepoint)
 * Inspects bytes of a character (in UTF-8 format) starting at "ptr" upto "ptrend", and returns its patcode.
 * This macro is a replacement to the existing typemask[] table that works in both UTF-8 and non-UTF8 mode.
 * This macro should only be used by the compiler. This macro assumes that "gtm_utf8_mode" is TRUE.
 *
 * Outputs (both are lvalues assigned by the macro):
 *	ptrnext   - (ptr + 1) when *ptr is ASCII, otherwise whatever UTF8_MBTOWC returns.
 *	codepoint - the byte value when *ptr is ASCII, otherwise whatever UTF8_MBTOWC stores in it.
 * Value of the macro: typemask[*ptr] for ASCII input, CTYPEMASK(codepoint) otherwise, i.e. always a patcode
 * and not the code point itself.
 *
 * "ptr" is evaluated more than once, so it must be free of side effects. Arguments are parenthesized
 * where they take part in arithmetic or assignment so that arbitrary expressions can be passed.
 */
#define	TYPEMASK(ptr, ptrend, ptrnext, codepoint)						\
	(IS_ASCII(*(ptr))									\
		? (((ptrnext) = (ptr) + 1), ((codepoint) = *(ptr)), typemask[*(ptr)])		\
		: (((ptrnext) = UTF8_MBTOWC(ptr, ptrend, codepoint)), CTYPEMASK(codepoint)))

/* uint4	PATTERN_TYPEMASK(char *ptr, char *ptrend, char *ptrnext, wint_t codepoint)
 * Inspects bytes of a character (in UTF-8 format) starting at "ptr" upto "ptrend", and returns its patcode.
 * This macro is a replacement to the existing pattern_typemask[] table that works in both UTF-8 and non-UTF8 mode.
 * This macro should only be used by the runtime. This macro assumes that "gtm_utf8_mode" is TRUE.
 *
 * Outputs (both are lvalues assigned by the macro):
 *	ptrnext   - (ptr + 1) when *ptr is ASCII, otherwise whatever UTF8_MBTOWC returns.
 *	codepoint - the byte value when *ptr is ASCII, otherwise whatever UTF8_MBTOWC stores in it
 *		    (so it is assigned for single-byte as well as multi-byte characters).
 * Value of the macro: pattern_typemask[*ptr] for ASCII input, CTYPEMASK(codepoint) otherwise.
 *
 * "ptr" is evaluated more than once, so it must be free of side effects. Arguments are parenthesized
 * where they take part in arithmetic or assignment so that arbitrary expressions can be passed.
 */
#define	PATTERN_TYPEMASK(ptr, ptrend, ptrnext, codepoint)					\
	(IS_ASCII(*(ptr))									\
		? (((ptrnext) = (ptr) + 1), ((codepoint) = *(ptr)), pattern_typemask[*(ptr)])	\
		: (((ptrnext) = UTF8_MBTOWC(ptr, ptrend, codepoint)), CTYPEMASK(codepoint)))



/* Returns the display column width of a character given its code point. This macro
 * returns -1 for control characters and 0 for non-spacing (combining) characters.
 * It is a thin wrapper: the argument is cast to wint_t and handed to gtm_wcwidth(),
 * whose int result is returned unchanged.
 */
#define UTF8_WCWIDTH(c)	gtm_wcwidth((wint_t)(c))

/* The following macro is same as UTF8_WCWIDTH except that it returns 0 for unprintable valid characters as well.
 * It is primarily used by the IO code.
 *	CHAR - code point whose display width is wanted.
 *	RET  - lvalue that receives the width: when "utf8_active" is set it is UTF8_WCWIDTH(CHAR) with any
 *	       negative result replaced by 0; otherwise (or when built without UNICODE_SUPPORTED) it is 1.
 * This is a statement macro, not an expression: invoke it as "GTM_IO_WCWIDTH(c, width);". Both variants are
 * wrapped in do { } while (0) so that the macro is a single statement in every context (e.g. as the
 * unbraced body of an if/else) and behaves the same way in both build configurations.
 */
#ifdef UNICODE_SUPPORTED
#define GTM_IO_WCWIDTH(CHAR,RET)			\
do							\
{							\
	if (utf8_active)				\
	{						\
		(RET) = UTF8_WCWIDTH(CHAR);		\
		if (0 > (RET))				\
			(RET) = 0;			\
	} else						\
		(RET) = 1;				\
} while (0)
#else
#define GTM_IO_WCWIDTH(CHAR,RET)	do { (RET) = 1; } while (0)
#endif

/* Offsets for use with u32_line_term[]. The names suggest the usual line terminators (LS/PS being the
 * Line/Paragraph separators mentioned for GTM_U_ISPRINT); the array itself is defined elsewhere, so
 * confirm the exact code points there.
 */
#define U32_LT_LF	0
#define U32_LT_CR	1
#define U32_LT_NL	2
#define U32_LT_FF	3
#define U32_LT_LS	4
#define U32_LT_PS	5
#define U32_LT_LAST	5	/* not counting null sentinel */
/* The UChar-based prototype below is only declared in Unicode-enabled builds, after pulling in the ICU API header
 * (which presumably supplies the UChar type).
 */
#ifdef UNICODE_SUPPORTED
#include "gtm_icu_api.h"
int	trim_U16_line_term(UChar *buffer, int len);
#endif

/* There could be integral promotion/sign extension issues if short, int (or an integral type) is used for comparison. Avoid
 * such issues by defining each BOM as a string (byte sequence). Every *_BOM_LEN is derived from its literal
 * via STR_LIT_LEN rather than being hard-coded.
 */
#define UTF16BE_BOM		"\xFE\xFF"	/* Big Endian BYTE ORDER MARKER */
#define UTF16BE_BOM_LEN		STR_LIT_LEN(UTF16BE_BOM)

#define UTF16LE_BOM		"\xFF\xFE"	/* Little Endian BYTE ORDER MARKER */
#define UTF16LE_BOM_LEN		STR_LIT_LEN(UTF16LE_BOM)

#define UTF8_BOM		"\xEF\xBB\xBF"	/* No relevance to endian-ness, a UTF8 MARKER similar to UTF16_BOM */
#define UTF8_BOM_LEN		STR_LIT_LEN(UTF8_BOM)

#define UTF32BE_BOM		"\x00\x00\xFE\xFF"	/* Big Endian BYTE ORDER MARKER */
#define UTF32BE_BOM_LEN		STR_LIT_LEN(UTF32BE_BOM)

#define UTF32LE_BOM		"\xFF\xFE\x00\x00"	/* Little Endian BYTE ORDER MARKER */
#define UTF32LE_BOM_LEN		STR_LIT_LEN(UTF32LE_BOM)

/* The code point (U+FEFF) that all of the byte sequences above encode */
#define	BOM_CODEPOINT		0xFEFF

/* Call-site wrappers for utf8_badchar(), utf8_badchar_stx() and utf8_len_strict(). Their only job is to cast the
 * pointer arguments to (unsigned char *), which is what the prototypes declare, so that callers holding
 * plain char pointers do not need a cast at every call. Non-pointer arguments are passed through unchanged.
 */
#define	UTF8_BADCHAR(len, str, strtop, chset_len, chset)								\
	utf8_badchar((len), (unsigned char *)(str), (unsigned char *)(strtop), (chset_len), (unsigned char *)(chset))

#define	UTF8_BADCHAR_STX(len, str, strtop, chset_len, chset)								\
	utf8_badchar_stx((len), (unsigned char *)(str), (unsigned char *)(strtop), (chset_len), (unsigned char *)(chset))

#define	UTF8_LEN_STRICT(ptr, len)			\
	utf8_len_strict((unsigned char *)(ptr), (len))

/* Printability test that treats the Unicode line terminators LS and PS as non-printable.
 * As of this writing, ICU's u_isprint returns TRUE for LS/PS (Line/Paragraph separator; codepoints
 * 0x2028, 0x2029), which causes problems when extracting and loading data containing these
 * codepoints (in UTF8 mode). Ideally every line terminator in the u32_line_term[] array would be
 * checked, but this macro is performance intensive (it is called from mupip extract, which can take
 * hours for huge databases), so a loop is avoided and only LS/PS, which are known not to work right
 * with "u_isprint", are special-cased. Every other code point gets the unmodified u_isprint() answer.
 */
#define	GTM_U_ISPRINT(code)												\
	((((UChar32)(code) != (UChar32)UTF_LINE_SEPARATOR) && ((UChar32)(code) != (UChar32)UTF_PARA_SEPARATOR))	\
		? u_isprint(code)											\
		: FALSE)

/* Flag consulted by CTYPEMASK; TRUE if $ZPATNUMERIC is "UTF-8" (see the CTYPEMASK comment) */
GBLREF		boolean_t       utf8_patnumeric;
/* Prototypes of the routines behind the macros in this header: utf8_len_strict (UTF8_LEN_STRICT), gtm_wcwidth
 * (UTF8_WCWIDTH), utf8_badchar/utf8_badchar_stx (UTF8_BADCHAR/UTF8_BADCHAR_STX), plus related helpers that are
 * called directly. The implementations live outside this header.
 */
int		utf8_len(mstr* str);
int		utf8_len_stx(mstr* str);
int		utf8_len_strict(unsigned char* ptr, int len);
int		gtm_wcwidth(wint_t code);
int		gtm_wcswidth(unsigned char* ptr, int len, boolean_t strict, int nonprintwidth);
void		utf8_badchar(int len, unsigned char* str, unsigned char *strtop, int chset_len, unsigned char* chset);
void		utf8_badchar_stx(int len, unsigned char* str, unsigned char *strtop, int chset_len, unsigned char* chset);
unsigned char	*gtm_utf8_trim_invalid_tail(unsigned char *str, int len);

/* To prevent GTMSECSHR from pulling in the function "gtm_wcswidth" (used in util_output.c) and in turn the entire Unicode
 * codebase, we define a function-pointer variable and initialize it at startup to NULL only in GTMSECSHR and not-null
 * in all the other executables. The pointer type has the same signature as the gtm_wcswidth() prototype.
 */
typedef	int	(*gtm_wcswidth_fnptr_t)(unsigned char* ptr, int len, boolean_t strict, int nonprintwidth);

GBLREF	gtm_wcswidth_fnptr_t	gtm_wcswidth_fnptr;	/* see the comment just above the typedef */

#endif /* GTM_UTF8_H */