/* fis-gtm/sr_unix/gtm_utf8.h -- 837 lines, 44 KiB, C */

/****************************************************************
* *
* Copyright 2006, 2010 Fidelity Information Services, Inc.*
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#ifndef GTM_UTF8_H
#define GTM_UTF8_H
#include <wctype.h>
#include <wchar.h>
/*
*=======================================================================================================================
* UTF-8 BIT DISTRIBUTION:
*=======================================================================================================================
* Code range Scalar value UTF-8 Notes
* hexadecimal binary binary
* -----------------------------------------------------------------------------------------------------------------------
* 000000-00007F 0xxxxxxx 0xxxxxxx ASCII equivalence range;
* seven x seven x byte begins with zero
*
* 000080-0007FF 00000xxx xxxxxxxx 110xxxxx 10xxxxxx first byte begins with 110,
* three x, eight x five x, six x the following byte begins with 10.
*
* 000800-00FFFF xxxxxxxx xxxxxxxx 1110xxxx 10xxxxxx 10xxxxxx first byte begins with 1110,
* eight x, eight x four x, six x, six x the following bytes begin with 10.
*
* 010000-10FFFF 000xxxxx xxxxxxxx xxxxxxxx 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx First byte begins with 11110,
* five x, eight x, eight x three x, six x, six x, six x the following bytes begin with 10
*
* ====================================================================================================================
* Codepoint Codepoint binary UTF-8 binary UTF-8 hex
* hex
* --------------------------------------------------------------------------------------------------------------------
* 000000 0 0 00 # 1-byte UTF-8 encoding BEGIN
* 00007F 1111111 1111111 7F # 1-byte UTF-8 encoding END
*
* 000080 00010000000 11000010 10000000 C2 80 # 2-byte UTF-8 encoding BEGIN
* 0007FF 11111111111 11011111 10111111 DF BF # 2-byte UTF-8 encoding END
*
* 000800 0000100000000000 11100000 10100000 10000000 E0 A0 80 # 3-byte UTF-8 encoding BEGIN
* 00D7FF 1101011111111111 11101101 10011111 10111111 ED 9F BF # 3-byte UTF-8 encoding PAUSE
*
* 00D800 1101100000000000 11101101 10100000 10000000 ED A0 80 # surrogate invalid range BEGIN
* 00DFFF 1101111111111111 11101101 10111111 10111111 ED BF BF # surrogate invalid range END
*
* 00E000 1110000000000000 11101110 10000000 10000000 EE 80 80 # 3-byte UTF-8 encoding RESUME
* 00FFFF 1111111111111111 11101111 10111111 10111111 EF BF BF # 3-byte UTF-8 encoding END
*
* 010000 000010000000000000000 11110000 10010000 10000000 10000000 F0 90 80 80 # 4-byte UTF-8 encoding BEGIN
* 10FFFF 100001111111111111111 11110100 10001111 10111111 10111111 F4 8F BF BF # 4-byte UTF-8 encoding END
* ====================================================================================================================
*
*======================================================================================================================
* UTF-16 BIT DISTRIBUTION:
*======================================================================================================================
* Code range Scalar value UTF-16 Notes
* hexadecimal binary binary
* --------------------------------------------------------------------------------------------------------------------
* 000000-00FFFF xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx All code points in the Basic
* sixteen x sixteen x Multi-lingual Plane (BMP)
*
* 010000-10FFFF 000uuuuuxxxxxxxxxxxxxxxx 110110wwwwxxxxxx 110111xxxxxxxxxx All code points in the
* five u, sixteen x four w, six x, ten x Supplementary Plane (non-BMP)
*
* where wwww = uuuuu - 1
*=======================================================================================================================
*/
/* Names of the character sets */
#define UTF8_NAME			"UTF-8"
#define CHSET_M_STR			"M"
#define CHSET_UTF8_STR			UTF8_NAME
/* Largest code point that fits in a 1-, 2-, 3- and 4-byte UTF-8 encoding, followed by the code point range boundaries
 * used by the classification macros. Every cast-valued constant is wrapped in its own parentheses so that it always
 * expands as a single operand, whatever expression it is used in.
 */
#define UTF8_1BYTE_MAX			((unsigned)(wint_t)ASCII_MAX)
#define UTF8_2BYTE_MAX			((unsigned)(wint_t)0x7FF)
#define UTF8_3BYTE_MAX			((unsigned)(wint_t)0xFFFF)
#define UTF8_4BYTE_MAX			((unsigned)(wint_t)0x10FFFF)
#define U_BMP_MAX			((unsigned)(wint_t)0xFFFF)
#define U_SURROGATE_BEGIN		((unsigned)(wint_t)0xD800)
#define U_HIGH_SURROGATE_BEGIN		((unsigned)(wint_t)0xD800)
#define U_LOW_SURROGATE_BEGIN		((unsigned)(wint_t)0xDC00)
#define U_SURROGATE_END			((unsigned)(wint_t)0xDFFF)
/* Masks that isolate the payload bits (the x-s) of each kind of UTF-8 byte */
#define UTF8_LEAD_2BYTEMASK		0x1F	/* extract out the x-s in 110xxxxx */
#define UTF8_LEAD_3BYTEMASK		0x0F	/* extract out the x-s in 1110xxxx */
#define UTF8_LEAD_4BYTEMASK		0x07	/* extract out the x-s in 11110xxx */
#define UTF8_NONLEAD_BYTEMASK		0x3F	/* extract out the x-s in 10xxxxxx */
/* Fixed marker bits of each kind of UTF-8 byte */
#define UTF8_LEAD_2BYTE_PREFIX		0xC0	/* 110xxxxx where x is replaced with 0 */
#define UTF8_LEAD_3BYTE_PREFIX		0xE0	/* 1110xxxx where x is replaced with 0 */
#define UTF8_LEAD_4BYTE_PREFIX		0xF0	/* 11110xxx where x is replaced with 0 */
#define UTF8_NONLEAD_BYTE_PREFIX	0x80	/* 10xxxxxx where x is replaced with 0 */
/* Number of payload bits carried by each kind of UTF-8 byte */
#define UTF8_LEAD_2BYTE_BITLEN		5	/* bits extracted from the leading byte of a 2-byte UTF-8 encoding */
#define UTF8_LEAD_3BYTE_BITLEN		4	/* bits extracted from the leading byte of a 3-byte UTF-8 encoding */
#define UTF8_LEAD_4BYTE_BITLEN		3	/* bits extracted from the leading byte of a 4-byte UTF-8 encoding */
#define UTF8_NONLEAD_BITLEN		6	/* bits extracted from a nonleading byte of a UTF-8 encoding */
/* Code points of the Unicode line and paragraph separators */
#define UTF_LINE_SEPARATOR		0x2028
#define UTF_PARA_SEPARATOR		0x2029
#define UTF8_SURROGATE_BYTELEN		3	/* byte length of a surrogate code point written in UTF-8 form */
#define GTM_MB_LEN_MAX			4	/* maximum bytes we support for multi-byte char */
#define GTM_MB_DISP_LEN_MAX		2	/* maximum number of display columns for a multi-byte char.
						 * all characters we know fit in a display width of 2 columns
						 */
/* TRUE when byte x has the bit pattern 10xxxxxx, i.e. it can be a non-leading (continuation) byte of a UTF-8 stream */
#define UTF8_IS_VALID_TRAILING(x)	(0x80 == (0xc0 & (unsigned char)(x)))
/* TRUE when byte x can start a UTF-8 character, i.e. its entry in utf8_followlen[] is anything other than -1 */
#define UTF8_IS_VALID_LEADING(x)	((int)utf8_followlen[(unsigned char)(x)] != -1)
/* boolean_t U_IS_SURROGATE_CODE(wint_t c)
 * boolean_t U_IS_SURROGATE_HIGH(wint_t c)
 * boolean_t U_IS_SURROGATE_LOW(wint_t c)
 *	U_IS_SURROGATE_CODE is non-zero if code point c lies in the surrogate range [U_SURROGATE_BEGIN, U_SURROGATE_END].
 *	U_IS_SURROGATE_HIGH is non-zero if c is a leading (high) surrogate, [U_SURROGATE_BEGIN, U_LOW_SURROGATE_BEGIN).
 *	U_IS_SURROGATE_LOW  is non-zero if c is a trailing (low) surrogate, [U_LOW_SURROGATE_BEGIN, U_SURROGATE_END].
 *	Each returns 0 otherwise.
 *
 * Each range test is written as one unsigned subtraction followed by one comparison: a value below the lower bound
 * wraps around to a huge unsigned number and therefore fails the comparison. This gives the same answer as the
 * two-sided comparison while evaluating the argument exactly once, so an argument with side effects is safe.
 */
#define U_IS_SURROGATE_CODE(codepoint)									\
	(((unsigned)(codepoint) - U_SURROGATE_BEGIN) <= (U_SURROGATE_END - U_SURROGATE_BEGIN))
#define U_IS_SURROGATE_HIGH(codepoint)									\
	(((unsigned)(codepoint) - U_SURROGATE_BEGIN) < (U_LOW_SURROGATE_BEGIN - U_SURROGATE_BEGIN))
#define U_IS_SURROGATE_LOW(codepoint)									\
	(((unsigned)(codepoint) - U_LOW_SURROGATE_BEGIN) <= (U_SURROGATE_END - U_LOW_SURROGATE_BEGIN))
/* boolean_t UTF8_VALID_2BYTE(uchar_ptr_t mbptr, unsigned int bytelen)
 *	Returns TRUE if the byte after the leading byte at mbptr is a valid continuation byte, i.e. mbptr points to a
 *	valid 2-byte UTF-8 encoding. Otherwise resets bytelen to 1 and returns FALSE.
 *	Assumes "mbptr" is of type "uchar_ptr_t". Both parameters are parenthesized in the expansion so that
 *	expression arguments (e.g. "buf + off") expand correctly.
 */
#define UTF8_VALID_2BYTE(mbptr, bytelen)				\
	(UTF8_IS_VALID_TRAILING((mbptr)[1])				\
		? TRUE							\
		: ((bytelen) = 1, FALSE))
/* boolean_t UTF8_NONCHAR_CODE_3BYTE(wint_t c)
 * boolean_t UTF8_NONCHAR_CODE_4BYTE(wint_t c)
 *	Noncharacter tests on the code point (c) of a character whose UTF-8 form is 3 bytes or 4 bytes long
 *	respectively. Each yields non-zero for a noncharacter and 0 otherwise. Noncharacters are code points that
 *	will never be assigned a character:
 *		U+FDD0 - U+FDEF (32 code points, all of which are 3-byte encoded)
 *		U+nFFFE and U+nFFFF for every n from 0x0 to 0x10 (34 code points; U+FFFE and U+FFFF are 3-byte
 *		encoded, the others are 4-byte encoded)
 *	The "low 16 bits are FFFE or FFFF" test is done by masking off the lowest bit and comparing against 0xFFFE.
 */
#define UTF8_NONCHAR_CODE_3BYTE(codepoint)							\
	((0xFDD0 <= (unsigned)(codepoint))							\
		&& ((0xFDEF >= (unsigned)(codepoint))						\
			|| (0xFFFE == (0xFFFE & (unsigned)(codepoint)))))
#define UTF8_NONCHAR_CODE_4BYTE(codepoint)							\
	(0xFFFE == (0xFFFE & (unsigned)(codepoint)))
/* boolean_t UTF8_NONCHAR_CODEPOINT(wint_t c)
 *	Non-zero if the code point (c) of ANY character is a noncharacter, 0 otherwise.
 *	Code points up to UTF8_3BYTE_MAX are tested with UTF8_NONCHAR_CODE_3BYTE, which is relied upon to give the right
 *	answer for 1-byte and 2-byte code points as well; the UTF8_NONCHAR_CODE_4BYTE test is applied when that first
 *	test does not already answer "noncharacter". The "&&" term is grouped explicitly so that the intended
 *	precedence against "||" is visible.
 */
#define UTF8_NONCHAR_CODEPOINT(codepoint)									\
	((((unsigned)(codepoint) <= UTF8_3BYTE_MAX) && (UTF8_NONCHAR_CODE_3BYTE(codepoint)))			\
		|| (UTF8_NONCHAR_CODE_4BYTE(codepoint)))
/* boolean_t UTF8_NONCHAR_3BYTE(uchar_ptr_t mbptr)
 * boolean_t UTF8_NONCHAR_4BYTE(uchar_ptr_t mbptr)
 *	Each of these macros returns non-zero if mbptr points to the UTF-8 encoding of a noncharacter (0 otherwise).
 *	The checks are done on the UTF-8 byte stream rather than on the code point. The byte patterns are:
 *		U+FDD0 - U+FDEF:
 *			0xEF 0xB7 0x90 - 0xEF 0xB7 0xAF		(32 code points)
 *		All U+nFFFE and U+nFFFF:
 *			0xEF 0xBF 0xBE      and 0xEF 0xBF 0xBF		(U+FFFE, U+FFFF)
 *			0xF0 0x9F 0xBF 0xBE and 0xF0 0x9F 0xBF 0xBF	(U+1FFFE, U+1FFFF)
 *			0xF0 0xAF 0xBF 0xBE and 0xF0 0xAF 0xBF 0xBF	(U+2FFFE, U+2FFFF)
 *			....
 *			0xF4 0x8F 0xBF 0xBE and 0xF4 0x8F 0xBF 0xBF	(U+10FFFE, U+10FFFF)
 *	The bytes are compared against values above 0x7F, so mbptr has to address unsigned bytes. The 4-byte test
 *	does not look at the leading byte. Neither macro checks that the non-leading bytes are valid continuation
 *	bytes; the callers in this header do that first.
 *	"mbptr" is parenthesized on every use so that an expression argument (e.g. "buf + off") indexes correctly.
 */
#define UTF8_NONCHAR_3BYTE(mbptr)									\
	(((mbptr)[0] == 0xEF)										\
		&& ((((mbptr)[1] == 0xB7) && ((unsigned)((mbptr)[2] - 0x90) < 32))			\
			|| (((mbptr)[1] == 0xBF) && (((mbptr)[2] & 0xBE) == 0xBE))))
#define UTF8_NONCHAR_4BYTE(mbptr)									\
	((((mbptr)[1] & 0x0F) == 0x0F) && ((mbptr)[2] == 0xBF) && (((mbptr)[3] & 0xBE) == 0xBE))
/* boolean_t UTF8_VALID_3BYTE(uchar_ptr_t mbptr, unsigned int bytelen)
 *	Returns TRUE if mbptr points to a valid 3-byte UTF-8 encoding: both non-leading bytes are continuation bytes
 *	and the sequence is not an overlong form, a surrogate or a noncharacter.
 *	Otherwise resets bytelen to 1 and returns FALSE.
 *	Assumes "mbptr" is of type "uchar_ptr_t". Both parameters are parenthesized in the expansion so that
 *	expression arguments expand correctly.
 */
#define UTF8_VALID_3BYTE(mbptr, bytelen)									\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])				\
		&& (((mbptr)[0] != 0xE0) || ((mbptr)[1] >= 0xA0)) /* bytestream is above 3-byte UTF-8 BEGIN */	\
		&& (((mbptr)[0] != 0xED) || ((mbptr)[1] <= 0x9F)) /* bytestream is NOT a UTF-8 surrogate */	\
		&& !UTF8_NONCHAR_3BYTE((mbptr)))		  /* bytestream is NOT a 3-byte noncharacter */	\
		? TRUE												\
		: ((bytelen) = 1, FALSE))
/* boolean_t UTF8_VALID_4BYTE(uchar_ptr_t mbptr, unsigned int bytelen)
 *	Returns TRUE if mbptr points to a valid 4-byte UTF-8 encoding: all three non-leading bytes are continuation
 *	bytes, the sequence is neither an overlong form nor beyond the last 4-byte encoding, and it is not a
 *	noncharacter. Otherwise resets bytelen to 1 and returns FALSE.
 *	Assumes "mbptr" is of type "uchar_ptr_t". Both parameters are parenthesized in the expansion so that
 *	expression arguments expand correctly.
 */
#define UTF8_VALID_4BYTE(mbptr, bytelen)									\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])				\
		&& UTF8_IS_VALID_TRAILING((mbptr)[3])								\
		&& (((mbptr)[0] != 0xF0) || ((mbptr)[1] >= 0x90)) /* bytestream is above 4-byte UTF-8 BEGIN */	\
		&& (((mbptr)[0] != 0xF4) || ((mbptr)[1] <= 0x8F)) /* bytestream is below 4-byte UTF-8 END */	\
		&& !UTF8_NONCHAR_4BYTE((mbptr)))		  /* bytestream is NOT a 4-byte noncharacter */	\
		? TRUE												\
		: ((bytelen) = 1, FALSE))
/* uchar_ptr_t UTF8_MBTOWC_2BYTE(uchar_ptr_t mbptr, wint_t codepoint)
 *	If mbptr points to a valid 2-byte UTF-8 encoding, sets codepoint to the decoded code point and returns
 *	(mbptr + 2). Otherwise sets codepoint to WEOF and returns (mbptr + 1).
 *	Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]" on entry (the
 *	payload bits of the leading byte are taken from it). Both parameters are parenthesized in the expansion so
 *	that expression arguments expand correctly.
 */
#define UTF8_MBTOWC_2BYTE(mbptr, codepoint)									\
	(UTF8_IS_VALID_TRAILING((mbptr)[1])									\
		? ((codepoint) = ((((codepoint) & UTF8_LEAD_2BYTEMASK) << UTF8_NONLEAD_BITLEN)			\
					| ((mbptr)[1] & UTF8_NONLEAD_BYTEMASK)),				\
			((mbptr) + 2))										\
		: ((codepoint) = (wint_t)WEOF, ((mbptr) + 1)))
/* uchar_ptr_t UTF8_MBTOWC_3BYTE(uchar_ptr_t mbptr, wint_t codepoint)
 *	If mbptr points to a valid 3-byte UTF-8 encoding (continuation bytes present, not overlong, not a surrogate,
 *	not a noncharacter), sets codepoint to the decoded code point and returns (mbptr + 3).
 *	Otherwise sets codepoint to WEOF and returns (mbptr + 1).
 *	Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]" on entry.
 *	Both parameters are parenthesized in the expansion so that expression arguments expand correctly.
 */
#define UTF8_MBTOWC_3BYTE(mbptr, codepoint)									\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])				\
		&& (((mbptr)[0] != 0xE0) || ((mbptr)[1] >= 0xA0)) /* bytestream is above 3-byte UTF-8 BEGIN */	\
		&& (((mbptr)[0] != 0xED) || ((mbptr)[1] <= 0x9F)) /* bytestream is NOT a UTF-8 surrogate */	\
		&& !UTF8_NONCHAR_3BYTE((mbptr)))		  /* bytestream is NOT a 3-byte noncharacter */	\
		? ((codepoint) = ((((((codepoint) & UTF8_LEAD_3BYTEMASK)					\
					<< UTF8_NONLEAD_BITLEN) | ((mbptr)[1] & UTF8_NONLEAD_BYTEMASK))		\
					<< UTF8_NONLEAD_BITLEN) | ((mbptr)[2] & UTF8_NONLEAD_BYTEMASK)),	\
			((mbptr) + 3))										\
		: ((codepoint) = (wint_t)WEOF, ((mbptr) + 1)))
/* uchar_ptr_t UTF8_MBTOWC_4BYTE(uchar_ptr_t mbptr, wint_t codepoint)
 *	If mbptr points to a valid 4-byte UTF-8 encoding (continuation bytes present, not overlong, not beyond the
 *	last 4-byte encoding, not a noncharacter), sets codepoint to the decoded code point and returns (mbptr + 4).
 *	Otherwise sets codepoint to WEOF and returns (mbptr + 1).
 *	Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]" on entry.
 *	Both parameters are parenthesized in the expansion so that expression arguments expand correctly.
 */
#define UTF8_MBTOWC_4BYTE(mbptr, codepoint)									\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])				\
		&& UTF8_IS_VALID_TRAILING((mbptr)[3])								\
		&& (((mbptr)[0] != 0xF0) || ((mbptr)[1] >= 0x90)) /* bytestream is above 4-byte UTF-8 BEGIN */	\
		&& (((mbptr)[0] != 0xF4) || ((mbptr)[1] <= 0x8F)) /* bytestream is below 4-byte UTF-8 END */	\
		&& !UTF8_NONCHAR_4BYTE((mbptr)))		  /* bytestream is NOT a 4-byte noncharacter */	\
		? ((codepoint) = ((((((((codepoint) & UTF8_LEAD_4BYTEMASK)					\
					<< UTF8_NONLEAD_BITLEN) | ((mbptr)[1] & UTF8_NONLEAD_BYTEMASK))		\
					<< UTF8_NONLEAD_BITLEN) | ((mbptr)[2] & UTF8_NONLEAD_BYTEMASK))		\
					<< UTF8_NONLEAD_BITLEN) | ((mbptr)[3] & UTF8_NONLEAD_BYTEMASK)),	\
			((mbptr) + 4))										\
		: ((codepoint) = (wint_t)WEOF, ((mbptr) + 1)))
/* uchar_ptr_t UTF8_MBNEXT_2BYTE(uchar_ptr_t mbptr)
 *	Returns (mbptr + 2) if mbptr points to a valid 2-byte UTF-8 encoding, (mbptr + 1) otherwise.
 *	Assumes "mbptr" is of type "uchar_ptr_t". The parameter is parenthesized in the expansion so that an
 *	expression argument expands correctly.
 */
#define UTF8_MBNEXT_2BYTE(mbptr)					\
	(UTF8_IS_VALID_TRAILING((mbptr)[1])				\
		? ((mbptr) + 2)						\
		: ((mbptr) + 1))
/* uchar_ptr_t UTF8_MBNEXT_3BYTE(uchar_ptr_t mbptr)
 *	Returns (mbptr + 3) if mbptr points to a valid 3-byte UTF-8 encoding (continuation bytes present, not
 *	overlong, not a surrogate, not a noncharacter), (mbptr + 1) otherwise.
 *	Assumes "mbptr" is of type "uchar_ptr_t". The parameter is parenthesized in the expansion so that an
 *	expression argument expands correctly.
 */
#define UTF8_MBNEXT_3BYTE(mbptr)										\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])				\
		&& (((mbptr)[0] != 0xE0) || ((mbptr)[1] >= 0xA0)) /* bytestream is above 3-byte UTF-8 BEGIN */	\
		&& (((mbptr)[0] != 0xED) || ((mbptr)[1] <= 0x9F)) /* bytestream is NOT a UTF-8 surrogate */	\
		&& !UTF8_NONCHAR_3BYTE((mbptr)))		  /* bytestream is NOT a 3-byte noncharacter */	\
		? ((mbptr) + 3)											\
		: ((mbptr) + 1))
/* uchar_ptr_t UTF8_MBNEXT_4BYTE(uchar_ptr_t mbptr)
 *	Returns (mbptr + 4) if mbptr points to a valid 4-byte UTF-8 encoding (continuation bytes present, not
 *	overlong, not beyond the last 4-byte encoding, not a noncharacter), (mbptr + 1) otherwise.
 *	Assumes "mbptr" is of type "uchar_ptr_t". The parameter is parenthesized in the expansion so that an
 *	expression argument expands correctly.
 */
#define UTF8_MBNEXT_4BYTE(mbptr)										\
	((UTF8_IS_VALID_TRAILING((mbptr)[1]) && UTF8_IS_VALID_TRAILING((mbptr)[2])				\
		&& UTF8_IS_VALID_TRAILING((mbptr)[3])								\
		&& (((mbptr)[0] != 0xF0) || ((mbptr)[1] >= 0x90)) /* bytestream is above 4-byte UTF-8 BEGIN */	\
		&& (((mbptr)[0] != 0xF4) || ((mbptr)[1] <= 0x8F)) /* bytestream is below 4-byte UTF-8 END */	\
		&& !UTF8_NONCHAR_4BYTE((mbptr)))		  /* bytestream is NOT a 4-byte noncharacter */	\
		? ((mbptr) + 4)											\
		: ((mbptr) + 1))
/* Table defined outside this header and indexed by the value of a byte. The macros below read the entry for the first
 * byte of a character as the total byte length of that character (2, 3 or 4 for multi-byte leading bytes) and treat an
 * entry of 1 on a non-ASCII byte as "invalid leading byte". Entries for ASCII bytes are presumably 1 as well, since
 * UTF8_VALID hands the entry back as the character length -- confirm against the table definition.
 */
LITREF unsigned int utf8_bytelen[];
/* boolean_t UTF8_VALID(char *mbptr, char *ptrend, unsigned int bytelen)
 *	Inspects the bytes of the UTF-8 string at "mbptr", never going past "ptrend", and returns TRUE if the byte
 *	sequence beginning at mbptr forms a well-formed and complete UTF-8 character, or FALSE otherwise.
 *	"bytelen" is set to the byte length of the character when returning TRUE and to 1 when returning FALSE.
 *	A well-formed UTF-8 sequence whose code point is a surrogate (D800 - DFFF) or a noncharacter is treated
 *	as invalid. The macro assumes that "ptrend" is at least "mbptr + 1" and does not check this.
 *
 *	The decision ladder is: ASCII byte => TRUE; table length of 1 => invalid leading byte; fewer bytes left
 *	than the table length => incomplete; otherwise the length-specific validation macro decides.
 */
#define UTF8_VALID(mbptr, ptrend, bytelen)									\
	((bytelen) = utf8_bytelen[*((uchar_ptr_t)(mbptr))],							\
	 ((*((uchar_ptr_t)(mbptr)) > ASCII_MAX)									\
	  ? ((1 != (bytelen))											\
	     ? (((int4)(((uchar_ptr_t)(ptrend)) - ((uchar_ptr_t)(mbptr))) >= (int4)(bytelen))			\
		? ((2 == (bytelen))										\
		   ? UTF8_VALID_2BYTE(((uchar_ptr_t)(mbptr)), (bytelen))					\
		   : ((3 == (bytelen))										\
		      ? UTF8_VALID_3BYTE(((uchar_ptr_t)(mbptr)), (bytelen))					\
		      : /* bytelen == 4 */ UTF8_VALID_4BYTE(((uchar_ptr_t)(mbptr)), (bytelen))))		\
		: ((bytelen) = 1, FALSE))	/* Not enough length in input string */				\
	     : FALSE)				/* Invalid leading byte */					\
	  : TRUE))				/* ASCII */
/* boolean_t U_VALID_CODE(wint_t codepoint)
 *	Non-zero if "codepoint" is a valid Unicode code point, 0 otherwise.
 *	A code point is rejected when any of the following holds (tested in this order):
 *		it is greater than U+10FFFF (UTF8_4BYTE_MAX)
 *		it is a surrogate code point
 *		it is a noncharacter code point
 */
#define U_VALID_CODE(codepoint)						\
	(!(((unsigned)(codepoint) > UTF8_4BYTE_MAX)			\
		|| U_IS_SURROGATE_CODE(codepoint)			\
		|| UTF8_NONCHAR_CODEPOINT(codepoint)))
/* Table defined outside this header and indexed by the value of a byte. UTF8_MBFOLLOW documents the entry for a byte as
 * the number of bytes (0, 1, 2 or 3) that must follow it to complete a UTF-8 character, with -1 marking a byte that
 * cannot legally start a character.
 */
LITREF signed int utf8_followlen[];
/* int UTF8_MBFOLLOW(char *mbptr)
 *	Looks only at the first byte of the (possibly incomplete) UTF-8 string at mbptr and yields the number of
 *	bytes that have to follow it to complete the character, as recorded in utf8_followlen[]. The possible
 *	results are 0, 1, 2 or 3, or -1 when the byte at mbptr is not a legal first byte of a UTF-8 character.
 */
#define UTF8_MBFOLLOW(mbptr)	(utf8_followlen[*((uchar_ptr_t)(mbptr))])
/* int UTF16BE_MBFOLLOW(char *mbptr, char *ptrend)
 *	Inspects up to two bytes of a (possibly incomplete) UTF-16BE string at mbptr and returns the number of bytes
 *	that must follow the byte at mbptr to form a complete UTF-16 character in BIG-ENDIAN format: 3 when the
 *	first code unit is a high surrogate, 1 otherwise. If fewer than 2 bytes lie in [mbptr, ptrend), it
 *	returns -1 without reading anything.
 *	Both parameters are parenthesized: with an argument such as "p + 1" the unparenthesized difference
 *	"ptrend - p + 1" overstates the remaining length by two and lets the macro read past ptrend.
 */
#define UTF16BE_MBFOLLOW(mbptr, ptrend) \
	((((ptrend) - (mbptr)) >= 2) ? (UTF16BE_HIGH_SURROGATE((mbptr)) ? 3 : 1) : -1)
/* int UTF16LE_MBFOLLOW(char *mbptr, char *ptrend)
 *	Inspects up to two bytes of a (possibly incomplete) UTF-16LE string at mbptr and returns the number of bytes
 *	that must follow the byte at mbptr to form a complete UTF-16 character in LITTLE-ENDIAN format: 3 when the
 *	first code unit is a high surrogate, 1 otherwise. If fewer than 2 bytes lie in [mbptr, ptrend), it
 *	returns -1 without reading anything.
 *	Both parameters are parenthesized: with an argument such as "p + 1" the unparenthesized difference
 *	"ptrend - p + 1" overstates the remaining length by two and lets the macro read past ptrend.
 */
#define UTF16LE_MBFOLLOW(mbptr, ptrend) \
	((((ptrend) - (mbptr)) >= 2) ? (UTF16LE_HIGH_SURROGATE((mbptr)) ? 3 : 1) : -1)
/* boolean_t UTF16BE_VALID(char *mbptr, char *ptrend, unsigned int bytelen)
 *	Inspects 2 or 4 bytes of the UTF-16BE string at "mbptr", never going past "ptrend", and returns TRUE if the
 *	byte sequence beginning at mbptr forms a well-formed and complete UTF-16 character in big-endian format, or
 *	FALSE otherwise. The macro also sets "bytelen" to 2 (BMP character, or a high surrogate that is not followed
 *	by a complete low surrogate) or 4 (surrogate pair).
 * NOTES:
 *	"bytelen" is always set irrespective of the validity of the code point (e.g. it is set to 4 for a
 *	surrogate pair for which the macro returns FALSE because its code point is a noncharacter).
 *	"bytelen" doubles as scratch storage for the decoded code point while it is being validated.
 *
 *	"ptrend" is assumed to be at least "mbptr + 2".
 *
 *	All parameters are parenthesized: with an argument such as "p + 1" the unparenthesized difference
 *	"ptrend - p + 1" overstates the remaining length and lets a truncated surrogate pair be read past ptrend.
 */
#define UTF16BE_VALID(mbptr, ptrend, bytelen)									\
	(UTF16BE_HIGH_SURROGATE((mbptr))			/* compute the code point first */		\
		? (((((ptrend) - (mbptr)) >= 4) && UTF16BE_LOW_SURROGATE(((mbptr) + 2)))			\
			? ((UTF16BE_LOAD_SURROGATE((mbptr), bytelen), U_VALID_CODE(bytelen))			\
				? ((bytelen) = 4, TRUE) : ((bytelen) = 4, FALSE))				\
			: ((bytelen) = 2, FALSE))								\
		: ((((bytelen) = UTF16BE_GET_UNIT((mbptr))), U_VALID_CODE(bytelen))				\
			? ((bytelen) = 2, TRUE) : ((bytelen) = 2, FALSE)))
/* boolean_t UTF16LE_VALID(char *mbptr, char *ptrend, unsigned int bytelen)
 *	Inspects 2 or 4 bytes of the UTF-16LE string at "mbptr", never going past "ptrend", and returns TRUE if the
 *	byte sequence beginning at mbptr forms a well-formed and complete UTF-16 character in little-endian format,
 *	or FALSE otherwise. The macro also sets "bytelen" to 2 (BMP character, or a high surrogate that is not
 *	followed by a complete low surrogate) or 4 (surrogate pair).
 * NOTES:
 *	"bytelen" is always set irrespective of the validity of the code point (e.g. it is set to 4 for a
 *	surrogate pair for which the macro returns FALSE because its code point is a noncharacter).
 *	"bytelen" doubles as scratch storage for the decoded code point while it is being validated.
 *
 *	"ptrend" is assumed to be at least "mbptr + 2".
 *
 *	All parameters are parenthesized: with an argument such as "p + 1" the unparenthesized difference
 *	"ptrend - p + 1" overstates the remaining length and lets a truncated surrogate pair be read past ptrend.
 */
#define UTF16LE_VALID(mbptr, ptrend, bytelen)									\
	(UTF16LE_HIGH_SURROGATE((mbptr))			/* compute the code point first */		\
		? (((((ptrend) - (mbptr)) >= 4) && UTF16LE_LOW_SURROGATE(((mbptr) + 2)))			\
			? ((UTF16LE_LOAD_SURROGATE((mbptr), bytelen), U_VALID_CODE(bytelen))			\
				? ((bytelen) = 4, TRUE) : ((bytelen) = 4, FALSE))				\
			: ((bytelen) = 2, FALSE))								\
		: ((((bytelen) = UTF16LE_GET_UNIT((mbptr))), U_VALID_CODE(bytelen))				\
			? ((bytelen) = 2, TRUE) : ((bytelen) = 2, FALSE)))
/* unsigned char *UTF8_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint)
 *	Inspects the bytes of the UTF-8 string at mbptr, never going past ptrend, and sets "codepoint" to the code
 *	point of the next character in the string. If the bytes starting at mbptr do not form a complete well-formed
 *	UTF-8 character, "codepoint" is set to WEOF. Returns (mbptr + len) where "len" is the byte length of the
 *	UTF-8 character found; when "codepoint" is set to WEOF the return value is (mbptr + 1).
 *	"codepoint" first receives the leading byte and is used to index utf8_bytelen[], so it must be an lvalue.
 *	Every use of "mbptr" is written as (uchar_ptr_t)(mbptr) so that the cast applies to the whole argument
 *	expression; with the cast applied to a bare "mbptr", a conditional-expression argument bound "+ 1" to its
 *	last operand only.
 */
#define UTF8_MBTOWC(mbptr, ptrend, codepoint)									\
	((codepoint) = (wint_t)(((uchar_ptr_t)(mbptr))[0]),							\
	 (((codepoint) <= ASCII_MAX) ? ((uchar_ptr_t)(mbptr) + 1)	/* ASCII. Do simplest check first. */	\
	  : ((utf8_bytelen[(codepoint)] == 1)				/* Invalid leading byte */		\
	     ? ((codepoint) = (wint_t)WEOF, ((uchar_ptr_t)(mbptr) + 1))						\
	     : (((int4)utf8_bytelen[(codepoint)]			/* Not enough length in input string */	\
			> (int4)(((uchar_ptr_t)(ptrend)) - ((uchar_ptr_t)(mbptr))))				\
		? ((codepoint) = (wint_t)WEOF, ((uchar_ptr_t)(mbptr) + 1))					\
		: ((utf8_bytelen[(codepoint)] == 2)								\
		   ? UTF8_MBTOWC_2BYTE(((uchar_ptr_t)(mbptr)), (codepoint))					\
		   : ((utf8_bytelen[(codepoint)] == 3)								\
		      ? UTF8_MBTOWC_3BYTE(((uchar_ptr_t)(mbptr)), (codepoint))					\
		      : /* utf8_bytelen[codepoint] == 4 */							\
			UTF8_MBTOWC_4BYTE(((uchar_ptr_t)(mbptr)), (codepoint))					\
	 ))))))
/* unsigned char *UTF8_MBNEXT(char *mbptr, char *ptrend)
 *	Inspects the bytes at mbptr, never going past ptrend, and returns the pointer to the beginning of the
 *	following character, i.e. mbptr advanced by the number of bytes used by the character at mbptr. If the bytes
 *	starting at mbptr do not form a well-formed character within the limits defined by ptrend, it returns the
 *	pointer to the next byte (i.e. mbptr + 1).
 *	Every use of "mbptr" and "ptrend" is written as (uchar_ptr_t)(arg) so that the cast applies to the whole
 *	argument expression. With the cast applied to the bare name, an argument such as "p + 1" made the
 *	remaining-length computation two bytes too large, so a character truncated by ptrend was still accepted
 *	and the returned pointer could lie beyond ptrend.
 */
#define UTF8_MBNEXT(mbptr, ptrend)										\
	(((((uchar_ptr_t)(mbptr))[0]) <= ASCII_MAX)								\
	 ? ((uchar_ptr_t)(mbptr) + 1)					/* ASCII. Do simplest check first. */	\
	 : ((utf8_bytelen[(((uchar_ptr_t)(mbptr))[0])] == 1)		/* Invalid leading byte */		\
	    ? ((uchar_ptr_t)(mbptr) + 1)									\
	    : (((int4)utf8_bytelen[(((uchar_ptr_t)(mbptr))[0])]							\
			> (int4)((uchar_ptr_t)(ptrend) - (uchar_ptr_t)(mbptr)))					\
	       ? ((uchar_ptr_t)(mbptr) + 1)				/* Not enough length in input string */	\
	       : ((utf8_bytelen[(((uchar_ptr_t)(mbptr))[0])] == 2)						\
		  ? UTF8_MBNEXT_2BYTE(((uchar_ptr_t)(mbptr)))							\
		  : ((utf8_bytelen[(((uchar_ptr_t)(mbptr))[0])] == 3)						\
		     ? UTF8_MBNEXT_3BYTE(((uchar_ptr_t)(mbptr)))						\
		     : /* utf8_bytelen[first byte] == 4 */ UTF8_MBNEXT_4BYTE(((uchar_ptr_t)(mbptr)))		\
	 )))))
/* unsigned char *UTF8_WCTOMB(wint_t codepoint, char *mbptr)
 *	Converts the code point of a character (codepoint) to its UTF-8 byte sequence and stores the result (1 to 4
 *	bytes long) at the beginning of the character array pointed to by mbptr. It returns the pointer advanced by
 *	the number of bytes stored. For invalid code points (surrogates, noncharacters, anything above
 *	UTF8_4BYTE_MAX) no conversion is done, nothing is stored and the macro returns mbptr.
 *	Within each branch the last byte is stored first and the leading byte last.
 *	Every use of "mbptr" is written as (uchar_ptr_t)(mbptr) so that the cast applies to the whole argument
 *	expression rather than to its first operand.
 */
#define UTF8_WCTOMB(codepoint, mbptr)										\
	(((unsigned)(codepoint) <= UTF8_1BYTE_MAX)			/* 1-byte UTF-8 encoding */		\
	 ? (((uchar_ptr_t)(mbptr))[0] = (unsigned char)(codepoint),						\
	    ((uchar_ptr_t)(mbptr)) + 1)										\
	 : (((unsigned)(codepoint) <= UTF8_2BYTE_MAX)			/* 2-byte UTF-8 encoding */		\
	    ? (((uchar_ptr_t)(mbptr))[1]									\
			= (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK) | UTF8_NONLEAD_BYTE_PREFIX),	\
	       ((uchar_ptr_t)(mbptr))[0]									\
			= (unsigned char)(((codepoint) >> UTF8_NONLEAD_BITLEN) | UTF8_LEAD_2BYTE_PREFIX),	\
	       ((uchar_ptr_t)(mbptr)) + 2)									\
	    : (((unsigned)(codepoint) <= UTF8_3BYTE_MAX)		/* 3-byte UTF-8 encoding */		\
	       ? ((U_IS_SURROGATE_CODE(codepoint) || UTF8_NONCHAR_CODE_3BYTE(codepoint))			\
		  ? ((uchar_ptr_t)(mbptr))			/* Surrogate or noncharacter (3-byte case) */	\
		  : (((uchar_ptr_t)(mbptr))[2]			/* Non-surrogate 3-byte case */			\
			= (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK) | UTF8_NONLEAD_BYTE_PREFIX),	\
		     ((uchar_ptr_t)(mbptr))[1]									\
			= (unsigned char)((((codepoint) >> UTF8_NONLEAD_BITLEN) & UTF8_NONLEAD_BYTEMASK)	\
						| UTF8_NONLEAD_BYTE_PREFIX),					\
		     ((uchar_ptr_t)(mbptr))[0]									\
			= (unsigned char)(((codepoint) >> (2 * UTF8_NONLEAD_BITLEN))				\
						| UTF8_LEAD_3BYTE_PREFIX),					\
		     ((uchar_ptr_t)(mbptr)) + 3))									\
	       : ((((unsigned)(codepoint) <= UTF8_4BYTE_MAX)		/* 4-byte UTF-8 encoding */		\
				&& !UTF8_NONCHAR_CODE_4BYTE(codepoint))						\
		  ? (((uchar_ptr_t)(mbptr))[3]									\
			= (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK) | UTF8_NONLEAD_BYTE_PREFIX),	\
		     ((uchar_ptr_t)(mbptr))[2]									\
			= (unsigned char)((((codepoint) >> UTF8_NONLEAD_BITLEN) & UTF8_NONLEAD_BYTEMASK)	\
						| UTF8_NONLEAD_BYTE_PREFIX),					\
		     ((uchar_ptr_t)(mbptr))[1]									\
			= (unsigned char)((((codepoint) >> (2 * UTF8_NONLEAD_BITLEN)) & UTF8_NONLEAD_BYTEMASK)	\
						| UTF8_NONLEAD_BYTE_PREFIX),					\
		     ((uchar_ptr_t)(mbptr))[0]									\
			= (unsigned char)(((codepoint) >> (3 * UTF8_NONLEAD_BITLEN))				\
						| UTF8_LEAD_4BYTE_PREFIX),					\
		     ((uchar_ptr_t)(mbptr)) + 4)									\
		  : ((uchar_ptr_t)(mbptr))))))			/* beyond U+10FFFF or 4-byte noncharacter */
/* boolean_t UTF8_SURROGATE(char *mbptr, char *ptrend)
 *	Inspects the bytes of the UTF-8 string at mbptr, never going past ptrend, and returns TRUE if the byte
 *	sequence beginning at mbptr has the 3-byte UTF-8 shape of an isolated surrogate code point (either a low
 *	or a high surrogate): 0xED, then a byte in 0xA0 - 0xBF, then any continuation byte.
 *	It returns FALSE otherwise, including when fewer than UTF8_SURROGATE_BYTELEN bytes remain.
 *	Every use of "mbptr" and "ptrend" is written as (uchar_ptr_t)(arg) so that the cast applies to the whole
 *	argument expression. With the cast applied to the bare name, an argument such as "p + 1" made the
 *	remaining-length computation two bytes too large and allowed reads past ptrend.
 */
#define UTF8_SURROGATE(mbptr, ptrend)										\
	(((UTF8_SURROGATE_BYTELEN				/* at least 3 bytes must be available */	\
			<= ((int4)((uchar_ptr_t)(ptrend) - (uchar_ptr_t)(mbptr))))				\
		&& (((uchar_ptr_t)(mbptr))[0] == 0xED)		/* leading byte is 0xED for surrogates */	\
		&& (((uchar_ptr_t)(mbptr))[1] >= 0xA0)		/* first non-leading byte is at least 0xA0 */	\
		&& (((uchar_ptr_t)(mbptr))[1] <= 0xBF)		/* first non-leading byte is at most 0xBF */	\
		&& (UTF8_IS_VALID_TRAILING(((uchar_ptr_t)(mbptr))[2]))) /* second non-leading byte is valid */	\
	 ? TRUE : FALSE)
/* void UTF8_LEADING_BYTE(char *mbptr, char *baseptr, char *leadptr)
 *	Sets leadptr to point to the leading byte of the UTF-8 character containing the byte pointed to by mbptr,
 *	never looking at bytes before baseptr. If the byte at mbptr is not part of a valid UTF-8 character (no
 *	leading byte is found at or after baseptr, or mbptr lies further from the leading byte than that byte's
 *	utf8_followlen[] entry allows), leadptr is set to mbptr.
 *	NOTE: mbptr and leadptr must not be the same variable.
 *
 *	The backward scan stops AT baseptr instead of stepping to (baseptr - 1): forming a pointer in front of the
 *	start of a buffer is undefined behavior. A continuation byte found at baseptr is rejected by the explicit
 *	test after the loop, so the result is the same as stepping past it would have given.
 *	The expansion remains a plain brace-enclosed block, which works whether or not the invocation is followed
 *	by a semicolon; as with any such macro, do not use it as the unbraced body of an "if" that has an "else".
 */
#define UTF8_LEADING_BYTE(mbptr, baseptr, leadptr)								\
{														\
	(leadptr) = (mbptr);											\
	while (((leadptr) > (baseptr)) && UTF8_IS_VALID_TRAILING(*(uchar_ptr_t)(leadptr)))			\
		--(leadptr);											\
	if (((leadptr) < (baseptr))										\
			|| UTF8_IS_VALID_TRAILING(*(uchar_ptr_t)(leadptr))					\
			|| !UTF8_IS_VALID_LEADING(*(uchar_ptr_t)(leadptr))					\
			|| (((mbptr) - (leadptr)) > utf8_followlen[*(uchar_ptr_t)(leadptr)]))			\
		(leadptr) = (mbptr);										\
}
/* unsigned UTF16_HIGH_SURROGATE(wint_t codepoint)
 * unsigned UTF16_LOW_SURROGATE(wint_t codepoint)
 *	Yield the first (high) and second (low) UTF-16 16-bit code unit of a code point in the supplementary planes.
 *	The high unit is 110110wwwwxxxxxx where wwww is (bits 16 and up of the code point) minus 1 and xxxxxx are
 *	bits 10 - 15; the low unit is 110111xxxxxxxxxx carrying bits 0 - 9.
 *	Note: these macros must be used only for supplementary code points, i.e. those greater than U_BMP_MAX and
 *	no greater than UTF8_4BYTE_MAX.
 */
#define UTF16_HIGH_SURROGATE(codepoint)										\
	(U_HIGH_SURROGATE_BEGIN											\
		| ((((codepoint) >> 16) - 1) << 6)	/* wwww: plane number minus one */			\
		| (((codepoint) >> 10) & 0x3F))		/* top six of the low sixteen bits */
#define UTF16_LOW_SURROGATE(codepoint)										\
	(U_LOW_SURROGATE_BEGIN											\
		| ((codepoint) & 0x3FF))		/* bottom ten bits */
/* wint_t UTF16_COMPOSE_SURROGATES(unsigned high, unsigned low)
 *	Reverses the split above: combines a high and a low surrogate code unit into the supplementary plane code
 *	point that the pair stands for.
 */
#define UTF16_COMPOSE_SURROGATES(high, low)									\
	((((((high) >> 6) & 0xF) + 1) << 16)		/* plane number = wwww + 1 */				\
		| (((high) & 0x3F) << 10)										\
		| ((low) & 0x3FF))
/* UTF16BE_STORE_UNIT(char *mbptr, unsigned codeunit)
 * UTF16LE_STORE_UNIT(char *mbptr, unsigned codeunit)
 *	Store a UTF-16 (16-bit) code unit as a 2-byte sequence at mbptr in big-endian (most significant byte first)
 *	or little-endian (least significant byte first) order. The high byte is stored before the low byte in both.
 *	The code units passed must be less than or equal to U_BMP_MAX.
 *	"mbptr" is parenthesized under the cast so that the cast applies to the whole argument expression, and the
 *	stored values are narrowed to unsigned char explicitly.
 */
#define UTF16BE_STORE_UNIT(mbptr, codeunit)									\
	((((uchar_ptr_t)(mbptr))[0] = (unsigned char)((codeunit) >> 8)),					\
	 (((uchar_ptr_t)(mbptr))[1] = (unsigned char)((codeunit) & 0x00FF)))
#define UTF16LE_STORE_UNIT(mbptr, codeunit)									\
	((((uchar_ptr_t)(mbptr))[1] = (unsigned char)((codeunit) >> 8)),					\
	 (((uchar_ptr_t)(mbptr))[0] = (unsigned char)((codeunit) & 0x00FF)))
/* int UTF16BE_GET_UNIT(char *mbptr)
 * int UTF16LE_GET_UNIT(char *mbptr)
 *	Return the single UTF-16 (16-bit) code unit held in the 2-byte sequence at mbptr, reading it in big-endian
 *	(most significant byte first) or little-endian (least significant byte first) order.
 *	"mbptr" is parenthesized under the cast so that the cast applies to the whole argument expression.
 */
#define UTF16BE_GET_UNIT(mbptr)											\
	((((uchar_ptr_t)(mbptr))[0] << 8) | ((uchar_ptr_t)(mbptr))[1])
#define UTF16LE_GET_UNIT(mbptr)											\
	((((uchar_ptr_t)(mbptr))[1] << 8) | ((uchar_ptr_t)(mbptr))[0])
/* wint_t UTF16BE_LOAD_SURROGATE(char *mbptr, wint_t codepoint)
 * wint_t UTF16LE_LOAD_SURROGATE(char *mbptr, wint_t codepoint)
 *	Load a UTF-16 surrogate code unit pair from the 4 bytes at mbptr (big-endian or little-endian respectively),
 *	store the supplementary plane code point it stands for in "codepoint" and yield that value.
 *	"codepoint" temporarily holds the high surrogate unit, so it must be an lvalue.
 *	Note: mbptr must point to a valid 4-byte sequence of high and low surrogates; nothing is checked here.
 *	The low surrogate is addressed as ((uchar_ptr_t)(mbptr) + 2), i.e. always two BYTES past the whole argument
 *	expression, instead of pasting "+2" onto the unparenthesized argument.
 */
#define UTF16BE_LOAD_SURROGATE(mbptr, codepoint)								\
	((codepoint) = UTF16BE_GET_UNIT((mbptr)),								\
	 (codepoint) = UTF16_COMPOSE_SURROGATES((codepoint), UTF16BE_GET_UNIT(((uchar_ptr_t)(mbptr) + 2))))
#define UTF16LE_LOAD_SURROGATE(mbptr, codepoint)								\
	((codepoint) = UTF16LE_GET_UNIT((mbptr)),								\
	 (codepoint) = UTF16_COMPOSE_SURROGATES((codepoint), UTF16LE_GET_UNIT(((uchar_ptr_t)(mbptr) + 2))))
/* char *UTF16BE_WCTOMB(wint_t codepoint, char *mbptr)
 *	Converts the code point of a character (codepoint) into big-endian UTF-16 bytes and stores the result
 *	(2 bytes for a BMP code point, 4 bytes for a surrogate pair) at the beginning of the character array
 *	pointed to by mbptr. It returns mbptr advanced by the number of bytes stored. For invalid code points
 *	(as decided by U_VALID_CODE) no conversion is done and the macro returns mbptr.
 *	The result has the type of "mbptr"; the pointer arithmetic assumes mbptr is a byte pointer.
 *	"mbptr" is parenthesized on every use so that a compound argument expression is advanced as a whole.
 */
#define UTF16BE_WCTOMB(codepoint, mbptr)									\
	(U_VALID_CODE(codepoint)										\
		? (((codepoint) <= U_BMP_MAX)									\
			? (UTF16BE_STORE_UNIT((mbptr), (codepoint)), ((mbptr) + 2)) /* code points in BMP */	\
			: (UTF16BE_STORE_UNIT((mbptr), UTF16_HIGH_SURROGATE(codepoint)), /* supplementary */	\
			   UTF16BE_STORE_UNIT(((mbptr) + 2), UTF16_LOW_SURROGATE(codepoint)),			\
			   ((mbptr) + 4)))									\
		: (mbptr))
/* char *UTF16LE_WCTOMB(wint_t codepoint, char *mbptr)
 *	Converts the code point of a character (codepoint) into little-endian UTF-16 bytes and stores the result
 *	(2 bytes for a BMP code point, 4 bytes for a surrogate pair) at the beginning of the character array
 *	pointed to by mbptr. It returns mbptr advanced by the number of bytes stored. For invalid code points
 *	(as decided by U_VALID_CODE) no conversion is done and the macro returns mbptr.
 *	The result has the type of "mbptr"; the pointer arithmetic assumes mbptr is a byte pointer.
 *	"mbptr" is parenthesized on every use so that a compound argument expression is advanced as a whole.
 */
#define UTF16LE_WCTOMB(codepoint, mbptr)									\
	(U_VALID_CODE(codepoint)										\
		? (((codepoint) <= U_BMP_MAX)				/* 16-bit characters */			\
			? (UTF16LE_STORE_UNIT((mbptr), (codepoint)), ((mbptr) + 2)) /* code points in BMP */	\
			: (UTF16LE_STORE_UNIT((mbptr), UTF16_HIGH_SURROGATE(codepoint)), /* supplementary */	\
			   UTF16LE_STORE_UNIT(((mbptr) + 2), UTF16_LOW_SURROGATE(codepoint)),			\
			   ((mbptr) + 4)))									\
		: (mbptr))
/* char *UTF16BE_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint)
 *	Inspects 2 bytes (or 4 bytes for a surrogate pair) of the big-endian UTF-16 string at mbptr and sets
 *	"codepoint" to the code point of the next character in the string. Returns (mbptr + len) where "len" is the
 *	byte length (2 or 4) of the UTF-16BE character found. If the bytes starting at mbptr do not form a complete
 *	well-formed UTF-16BE character with a valid code point, it sets codepoint to WEOF and returns mbptr
 *	(NOT mbptr + 1, unlike UTF8_MBTOWC).
 *	At least 2 bytes must be readable at mbptr; ptrend is consulted only before reading a low surrogate.
 *	All parameters are parenthesized: with an argument such as "p + 1" the unparenthesized difference
 *	"ptrend - p + 1" overstates the remaining length and lets a truncated surrogate pair be read past ptrend.
 */
#define UTF16BE_MBTOWC(mbptr, ptrend, codepoint)								\
	((UTF16BE_HIGH_SURROGATE((mbptr))			/* compute the code point first */		\
		? (((((ptrend) - (mbptr)) >= 4) && UTF16BE_LOW_SURROGATE(((mbptr) + 2)))			\
			? UTF16BE_LOAD_SURROGATE((mbptr), codepoint) : ((codepoint) = WEOF))			\
		: ((codepoint) = UTF16BE_GET_UNIT((mbptr)))),							\
	 (U_VALID_CODE(codepoint)				/* validate the code point */			\
		? (((codepoint) <= U_BMP_MAX) ? ((mbptr) + 2) : ((mbptr) + 4))					\
		: (((codepoint) = WEOF), (mbptr))))
/* char *UTF16LE_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint)
 *	Inspects 2 bytes (or 4 bytes for a surrogate pair) of the little-endian UTF-16 string at mbptr and sets
 *	"codepoint" to the code point of the next character in the string. Returns (mbptr + len) where "len" is the
 *	byte length (2 or 4) of the UTF-16LE character found. If the bytes starting at mbptr do not form a complete
 *	well-formed UTF-16LE character with a valid code point, it sets codepoint to WEOF and returns mbptr
 *	(NOT mbptr + 1, unlike UTF8_MBTOWC).
 *	At least 2 bytes must be readable at mbptr; ptrend is consulted only before reading a low surrogate.
 *	All parameters are parenthesized: with an argument such as "p + 1" the unparenthesized difference
 *	"ptrend - p + 1" overstates the remaining length and lets a truncated surrogate pair be read past ptrend.
 */
#define UTF16LE_MBTOWC(mbptr, ptrend, codepoint)								\
	((UTF16LE_HIGH_SURROGATE((mbptr))			/* compute the code point first */		\
		? (((((ptrend) - (mbptr)) >= 4) && UTF16LE_LOW_SURROGATE(((mbptr) + 2)))			\
			? UTF16LE_LOAD_SURROGATE((mbptr), codepoint) : ((codepoint) = WEOF))			\
		: ((codepoint) = UTF16LE_GET_UNIT((mbptr)))),							\
	 (U_VALID_CODE(codepoint)				/* validate the code point */			\
		? (((codepoint) <= U_BMP_MAX) ? ((mbptr) + 2) : ((mbptr) + 4))					\
		: (((codepoint) = WEOF), (mbptr))))
/* boolean_t UTF16BE_HIGH_SURROGATE(char *mbptr)
 * Loads the 16-bit big-endian unit at mbptr (at most 2 bytes are inspected) and yields TRUE when
 * that unit is a well-formed UTF-16BE high surrogate (U+D800 - U+DBFF), FALSE otherwise.
 */
#define UTF16BE_HIGH_SURROGATE(mbptr)	(U_IS_SURROGATE_HIGH(UTF16BE_GET_UNIT(mbptr)))
/* boolean_t UTF16LE_HIGH_SURROGATE(char *mbptr)
 * Loads the 16-bit little-endian unit at mbptr (at most 2 bytes are inspected) and yields TRUE when
 * that unit is a well-formed UTF-16LE high surrogate (U+D800 - U+DBFF), FALSE otherwise.
 */
#define UTF16LE_HIGH_SURROGATE(mbptr)	(U_IS_SURROGATE_HIGH(UTF16LE_GET_UNIT(mbptr)))
/* boolean_t UTF16BE_LOW_SURROGATE(char *mbptr)
 * Loads the 16-bit big-endian unit at mbptr (at most 2 bytes are inspected) and yields TRUE when
 * that unit is a well-formed UTF-16BE low surrogate (U+DC00 - U+DFFF), FALSE otherwise.
 */
#define UTF16BE_LOW_SURROGATE(mbptr)	(U_IS_SURROGATE_LOW(UTF16BE_GET_UNIT(mbptr)))
/* boolean_t UTF16LE_LOW_SURROGATE(char *mbptr)
 * Loads the 16-bit little-endian unit at mbptr (at most 2 bytes are inspected) and yields TRUE when
 * that unit is a well-formed UTF-16LE low surrogate (U+DC00 - U+DFFF), FALSE otherwise.
 */
#define UTF16LE_LOW_SURROGATE(mbptr)	(U_IS_SURROGATE_LOW(UTF16LE_GET_UNIT(mbptr)))
/* Character classification of Unicode characters by code point. Every macro forwards its argument
 * unchanged to the like-named u_* routine, except U_ISPRINT which goes through GTM_U_ISPRINT.
 */
#define U_ISLOWER(c)	u_islower(c)
#define U_ISUPPER(c)	u_isupper(c)
#define U_ISALPHA(c)	u_isalpha(c)
#define U_ISCNTRL(c)	u_iscntrl(c)
#define U_ISDIGIT(c)	u_isdigit(c)
#define U_ISPUNCT(c)	u_ispunct(c)
#define U_ISSPACE(c)	u_isspace(c)
#define U_ISBLANK(c)	u_isblank(c)
#define U_ISGRAPH(c)	u_isgraph(c)
#define U_ISPRINT(c)	GTM_U_ISPRINT(c)	/* see macro definition for why redirection needed */
#define U_ISTITLE(c)	u_istitle(c)
#define U_CHARTYPE(c)	u_charType(c)
/* uint4 CTYPEMASK(wint_t c)
*
* This macro assumes that "c" is a valid unicode codepoint.
*
* Returns a patcode from a code point (wide character wint_t) paralleling the way ICU library functions classify codepoints.
* u_isalpha (for A)
* u_isdigit (for N)
* u_ispunct (for P)
* u_iscntrl (for C)
* But with the following adjustments.
* 1) If $ZPATNUMERIC is not "UTF-8", non-ASCII decimal digits are classified as A.
* 2) Non-decimal digits (Nl and No) are classified as A. Note: u_isdigit only matches decimal digits.
* 3) Anything left is classified via u_isprint into either P or C. Note: u_isprint only matches non-control characters.
* Note that the ISV $ZPATN[UMERIC] dictates how the pattern class N used in the pattern match operator is interpreted.
* If $ZPATNUMERIC is "UTF-8", the pattern class N matches any decimal numeric character as defined by the Unicode standard.
* If $ZPATNUMERIC is "M", GT.M restricts the pattern class N to match only ASCII digits 0-9 (i.e. ASCII 48-57).
* The variable "utf8_patnumeric" is TRUE if $ZPATNUMERIC is "UTF-8".
*
* The above rules result in the following mapping
* --------------------------------------------------
* Unicode general category GT.M patcode class
* --------------------------------------------------
* L* (all letters) -> A
* M* (all marks) -> P
* Nd (decimal numbers) -> N (if decimal digit is ASCII or $ZPATNUMERIC is "UTF-8", otherwise -> A)
* Nl (letter numbers) -> A (examples of Nl are Roman numerals)
* No (other numbers) -> A (examples of No are fractions)
* P* (all punctuation) -> P
* S* (all symbols) -> P
* Zs (spaces) -> P
* Zl (line separators) -> C
* Zp (paragraph separators) -> C
* C* (all control codepoints) -> C
*
* For a description of the Unicode general categories see http://unicode.org/versions/Unicode4.0.0/ch04.pdf (section 4.5)
*
* E = A + P + N + C and the classifications A, P, N, and C are mutually exclusive.
*
* This means that PATM_UTF8_NONBASIC does not currently have any codepoints mapped to it.
* It is being retained in case it is needed in the future.
*/
/* General-category mask of the non-decimal numbers (Nl and No); CTYPEMASK maps these to PATM_UTF8_ALPHABET */
#define GTM_NA_MASK	(U_GC_NL_MASK | U_GC_NO_MASK)
/* Patcode selection written as a first-match ladder: the first test that holds decides the result.
 * "c" is expanded in several of the tests, so it must be free of side effects.
 */
#define CTYPEMASK(c)											\
	(U_ISALPHA(c)				/* alphabetic: refine by case */			\
		? (U_ISLOWER(c) ? PATM_L	/* lower-case */					\
			: U_ISUPPER(c) ? PATM_U	/* upper-case */					\
			: PATM_UTF8_ALPHABET)	/* alphabetic but neither lower nor upper case */	\
	: U_ISDIGIT(c)				/* decimal digit, ASCII or non-ASCII */			\
		? ((utf8_patnumeric || IS_ASCII(c))							\
			? PATM_N		/* ASCII digit OR $ZPATNUMERIC set to "UTF-8" */	\
			: PATM_UTF8_ALPHABET)	/* non-ASCII digit and $ZPATNUMERIC set to "M" */	\
	: (U_MASK(U_CHARTYPE(c)) & GTM_NA_MASK)								\
		? PATM_UTF8_ALPHABET		/* non-decimal numbers (Nl, No) */			\
	: U_ISPUNCT(c)										\
		? PATM_P			/* punctuation */					\
	: U_ISCNTRL(c)										\
		? PATM_C			/* control */						\
	: U_ISPRINT(c)				/* not part of any basic class */			\
		? PATM_P			/* printable leftovers count as punctuation */		\
		: PATM_C)			/* everything else counts as control */
/* uint4 TYPEMASK(char *ptr, char *ptrend, char *ptrnext, wint_t codepoint)
 * Inspects the bytes of one character (in UTF-8 format) starting at "ptr" and going up to "ptrend",
 * and yields its patcode.
 *	- ASCII lead byte	: "ptrnext" is set to ptr + 1, "codepoint" to the byte itself and the
 *				  patcode is looked up in typemask[].
 *	- anything else		: "ptrnext" is set to what UTF8_MBTOWC yields (which also sets "codepoint")
 *				  and the patcode is CTYPEMASK(codepoint).
 * This macro is a replacement for the existing typemask[] table that works in both UTF-8 and non-UTF8 mode.
 * This macro should only be used by the compiler. This macro assumes that "gtm_utf8_mode" is TRUE.
 * "ptrnext" and "codepoint" must be modifiable lvalues. "ptr" is expanded several times (so it must be
 * free of side effects) and is parenthesized at every use so that an expression argument keeps its
 * meaning inside "ptr + 1".
 */
#define TYPEMASK(ptr, ptrend, ptrnext, codepoint)							\
	(IS_ASCII(*(ptr))										\
		? (((ptrnext) = (ptr) + 1), ((codepoint) = *(ptr)), typemask[*(ptr)])			\
		: (((ptrnext) = UTF8_MBTOWC((ptr), (ptrend), codepoint)), CTYPEMASK(codepoint)))
/* uint4 PATTERN_TYPEMASK(char *ptr, char *ptrend, char *ptrnext, wint_t codepoint)
 * Inspects the bytes of one character (in UTF-8 format) starting at "ptr" and going up to "ptrend",
 * and yields its patcode.
 *	- ASCII lead byte	: "ptrnext" is set to ptr + 1, "codepoint" to the byte itself and the
 *				  patcode is looked up in pattern_typemask[].
 *	- anything else		: "ptrnext" is set to what UTF8_MBTOWC yields (which also sets "codepoint")
 *				  and the patcode is CTYPEMASK(codepoint).
 * This macro is a replacement for the existing pattern_typemask[] table that works in both UTF-8 and
 * non-UTF8 mode. This macro should only be used by the runtime. This macro assumes that "gtm_utf8_mode"
 * is TRUE.
 * "ptrnext" and "codepoint" must be modifiable lvalues. "ptr" is expanded several times (so it must be
 * free of side effects) and is parenthesized at every use so that an expression argument keeps its
 * meaning inside "ptr + 1".
 */
#define PATTERN_TYPEMASK(ptr, ptrend, ptrnext, codepoint)						\
	(IS_ASCII(*(ptr))										\
		? (((ptrnext) = (ptr) + 1), ((codepoint) = *(ptr)), pattern_typemask[*(ptr)])		\
		: (((ptrnext) = UTF8_MBTOWC((ptr), (ptrend), codepoint)), CTYPEMASK(codepoint)))
/* Display column width of the character whose code point is "c": the argument is cast to wint_t and
 * handed to gtm_wcwidth(). The result is -1 for control characters and 0 for non-spacing (combining)
 * characters.
 */
#define UTF8_WCWIDTH(c)		gtm_wcwidth((wint_t)(c))
/* GTM_IO_WCWIDTH(CHAR, RET) is primarily used by the IO code. It stores a display width for CHAR in RET:
 *	- Unicode support compiled in and "utf8_active" set	: same as UTF8_WCWIDTH(CHAR) except that a
 *								  negative width is replaced by 0, i.e. unprintable
 *								  valid characters get width 0 as well.
 *	- otherwise						: always 1.
 * RET must be a modifiable lvalue; it is expanded more than once. The Unicode flavor is wrapped in
 * do { } while (0) so that it behaves as exactly one statement wherever the caller places it
 * (for example as the unbraced body of an "if" that has its own "else").
 */
#ifdef UNICODE_SUPPORTED
#define GTM_IO_WCWIDTH(CHAR, RET)					\
do									\
{									\
	if (utf8_active)						\
	{								\
		(RET) = UTF8_WCWIDTH(CHAR);				\
		(RET) = ((0 > (RET)) ? 0 : (RET));			\
	} else								\
		(RET) = 1;						\
} while (0)
#else
#define GTM_IO_WCWIDTH(CHAR, RET)	((RET) = 1)
#endif
/* Indices into u32_line_term[]. U32_LT_LAST is the highest index and does not count the null sentinel. */
#define U32_LT_LF	0	/* line feed */
#define U32_LT_CR	1	/* carriage return */
#define U32_LT_NL	2	/* NOTE(review): presumably next line (NEL) - confirm against u32_line_term[] */
#define U32_LT_FF	3	/* form feed */
#define U32_LT_LS	4	/* line separator */
#define U32_LT_PS	5	/* paragraph separator */
#define U32_LT_LAST	5	/* not counting null sentinel */
/* The prototype below takes a UChar buffer, so it is only declared when Unicode support is compiled in.
 * NOTE(review): UChar presumably comes in through gtm_icu_api.h - confirm.
 */
#ifdef UNICODE_SUPPORTED
#include "gtm_icu_api.h"
int trim_U16_line_term(UChar *buffer, int len);
#endif
/* Byte order markers. They are defined as strings (and not as short/int constants) because comparing
 * through an integral type could run into integral promotion/sign extension issues. Each *_LEN is the
 * length of the corresponding string literal as computed by STR_LIT_LEN.
 */
#define UTF16BE_BOM		"\xFE\xFF"		/* UTF-16 Big Endian BYTE ORDER MARKER */
#define UTF16BE_BOM_LEN		STR_LIT_LEN(UTF16BE_BOM)
#define UTF16LE_BOM		"\xFF\xFE"		/* UTF-16 Little Endian BYTE ORDER MARKER */
#define UTF16LE_BOM_LEN		STR_LIT_LEN(UTF16LE_BOM)
#define UTF8_BOM		"\xEF\xBB\xBF"		/* No relevance to endian-ness, a UTF8 MARKER similar to UTF16_BOM */
#define UTF8_BOM_LEN		STR_LIT_LEN(UTF8_BOM)
#define UTF32BE_BOM		"\x00\x00\xFE\xFF"	/* UTF-32 Big Endian BYTE ORDER MARKER */
#define UTF32BE_BOM_LEN		STR_LIT_LEN(UTF32BE_BOM)
#define UTF32LE_BOM		"\xFF\xFE\x00\x00"	/* UTF-32 Little Endian BYTE ORDER MARKER */
#define UTF32LE_BOM_LEN		STR_LIT_LEN(UTF32LE_BOM)
#define BOM_CODEPOINT		0xFEFF
/* Convenience wrappers around utf8_badchar(), utf8_badchar_stx() and utf8_len_strict(): the pointer
 * arguments are cast to (unsigned char *), so callers can pass plain "char *" buffers.
 */
#define UTF8_BADCHAR(len, str, strtop, chset_len, chset)						\
	utf8_badchar((len), (unsigned char *)(str), (unsigned char *)(strtop),				\
			(chset_len), (unsigned char *)(chset))
#define UTF8_BADCHAR_STX(len, str, strtop, chset_len, chset)						\
	utf8_badchar_stx((len), (unsigned char *)(str), (unsigned char *)(strtop),			\
			(chset_len), (unsigned char *)(chset))
#define UTF8_LEN_STRICT(ptr, len)	utf8_len_strict((unsigned char *)(ptr), (len))
/* This macro is needed to ensure that all Unicode line terminators are considered non-printable. As of
 * this writing, ICU's u_isprint returns TRUE for LS/PS (Line/Paragraph separator; codepoints 0x2028, 0x2029)
 * and this causes problems in extracting and loading data which contains these codepoints (in UTF8 mode).
 * Ideally, one should go through all the line terminators in the u32_line_term[] array and check them.
 * But since this routine is performance intensive (called from mupip extract which can take hours for
 * huge databases) we avoid a loop and just check for LS/PS which we know do not work right with "u_isprint".
 * "code" is expanded up to three times, so it must be free of side effects.
 */
#define GTM_U_ISPRINT(code)											\
	((((UChar32)UTF_LINE_SEPARATOR != (UChar32)(code)) && ((UChar32)UTF_PARA_SEPARATOR != (UChar32)(code)))	\
		? u_isprint(code)		/* neither LS nor PS: ICU decides */				\
		: FALSE)			/* LS and PS are forced to non-printable */
/* TRUE if $ZPATNUMERIC is "UTF-8" (consulted by CTYPEMASK when classifying decimal digits) */
GBLREF	boolean_t	utf8_patnumeric;

/* Length routines; utf8_len_strict() is normally reached through the UTF8_LEN_STRICT macro */
int		utf8_len(mstr *str);
int		utf8_len_stx(mstr *str);
int		utf8_len_strict(unsigned char *ptr, int len);
/* Display width routines; gtm_wcwidth() is normally reached through the UTF8_WCWIDTH macro */
int		gtm_wcwidth(wint_t code);
int		gtm_wcswidth(unsigned char *ptr, int len, boolean_t strict, int nonprintwidth);
/* Bad-character routines; normally reached through the UTF8_BADCHAR and UTF8_BADCHAR_STX macros */
void		utf8_badchar(int len, unsigned char *str, unsigned char *strtop, int chset_len, unsigned char *chset);
void		utf8_badchar_stx(int len, unsigned char *str, unsigned char *strtop, int chset_len, unsigned char *chset);
unsigned char	*gtm_utf8_trim_invalid_tail(unsigned char *str, int len);
/* To prevent GTMSECSHR from pulling in the function "gtm_wcswidth" (used in util_output.c) and in turn the
 * entire Unicode codebase, callers go through a function-pointer variable. It is initialized at startup to
 * NULL only in GTMSECSHR and to non-NULL in all the other executables. The typedef has the same signature
 * as gtm_wcswidth().
 */
typedef int	(*gtm_wcswidth_fnptr_t)(unsigned char *ptr, int len, boolean_t strict, int nonprintwidth);

GBLREF	gtm_wcswidth_fnptr_t	gtm_wcswidth_fnptr;	/* see the comment on gtm_wcswidth_fnptr_t just above */
#endif /* GTM_UTF8_H */