/**************************************************************** * * * Copyright 2006, 2010 Fidelity Information Services, Inc.* * * * This source code contains the intellectual property * * of its copyright holder(s), and is made available * * under a license. If you do not know the terms of * * the license, please stop and do not read further. * * * ****************************************************************/ #ifndef GTM_UTF8_H #define GTM_UTF8_H #include #include /* *======================================================================================================================= * UTF-8 BIT DISTRIBUTION: *======================================================================================================================= * Code range Scalar value UTF-8 Notes * hexadecimal binary binary * ----------------------------------------------------------------------------------------------------------------------- * 000000-00007F 0xxxxxxx 0xxxxxxx ASCII equivalence range; * seven x seven x byte begins with zero * * 000080-0007FF 00000xxx xxxxxxxx 110xxxxx 10xxxxxx first byte begins with 110, * three x, eight x five x, six x the following byte begins with 10. * * 000800-00FFFF xxxxxxxx xxxxxxxx 1110xxxx 10xxxxxx 10xxxxxx first byte begins with 1110, * eight x, eight x four x, six x, six x the following bytes begin with 10. 
* * 010000-10FFFF 000xxxxx xxxxxxxx xxxxxxxx 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx First byte begins with 11110, * five x, eight x, eight x three x, six x, six x, six x the following bytes begin with 10 * * ==================================================================================================================== * Codepoint Codepoint binary UTF-8 binary UTF-8 hex * hex * -------------------------------------------------------------------------------------------------------------------- * 000000 0 0 00 # 1-byte UTF-8 encoding BEGIN * 00007F 1111111 1111111 7F # 1-byte UTF-8 encoding END * * 000080 00010000000 11000010 10000000 C2 80 # 2-byte UTF-8 encoding BEGIN * 0007FF 11111111111 11011111 10111111 DF BF # 2-byte UTF-8 encoding END * * 000800 0000100000000000 11100000 10100000 10000000 E0 A0 80 # 3-byte UTF-8 encoding BEGIN * 00D7FF 1101011111111111 11101101 10011111 10111111 ED 9F BF # 3-byte UTF-8 encoding PAUSE * * 00D800 1101100000000000 11101101 10100000 10000000 ED A0 80 # surrogate invalid range BEGIN * 00DFFF 1101111111111111 11101101 10111111 10111111 ED BF BF # surrogate invalid range END * * 00E000 1110000000000000 11101110 10000000 10000000 EE 80 80 # 3-byte UTF-8 encoding RESUME * 00FFFF 1111111111111111 11101111 10111111 10111111 EF BF BF # 3-byte UTF-8 encoding END * * 010000 000010000000000000000 11110000 10010000 10000000 10000000 F0 90 80 80 # 4-byte UTF-8 encoding BEGIN * 10FFFF 100001111111111111111 11110100 10001111 10111111 10111111 F4 8F BF BF # 4-byte UTF-8 encoding END * ==================================================================================================================== * *====================================================================================================================== * UTF-16 BIT DISTRIBUTION: *====================================================================================================================== * Code range Scalar value UTF-16 Notes * hexadecimal binary binary * 
-------------------------------------------------------------------------------------------------------------------- * 000000-00FFFF xxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxx All code points in the Basic * sixteen x sixteen x Multi-lingual Plane (BMP) * * 010000-10FFFF 000uuuuuxxxxxxxxxxxxxxxx 110110wwwwxxxxxx 110111xxxxxxxxxx All code points in the * five u, sixteen x four w, six x, ten x Suplementary Plane (non-BMP) * * where wwww = uuuuu - 1 *======================================================================================================================= */ #define UTF8_NAME "UTF-8" #define CHSET_M_STR "M" #define CHSET_UTF8_STR UTF8_NAME #define UTF8_1BYTE_MAX (unsigned)(wint_t)ASCII_MAX #define UTF8_2BYTE_MAX (unsigned)(wint_t)0x7FF #define UTF8_3BYTE_MAX (unsigned)(wint_t)0xFFFF #define UTF8_4BYTE_MAX (unsigned)(wint_t)0x10FFFF #define U_BMP_MAX (unsigned)(wint_t)0xFFFF #define U_SURROGATE_BEGIN (unsigned)(wint_t)0xD800 #define U_HIGH_SURROGATE_BEGIN (unsigned)(wint_t)0xD800 #define U_LOW_SURROGATE_BEGIN (unsigned)(wint_t)0xDC00 #define U_SURROGATE_END (unsigned)(wint_t)0xDFFF #define UTF8_LEAD_2BYTEMASK 0x1F /* extract out the x-s in 110xxxxx */ #define UTF8_LEAD_3BYTEMASK 0x0F /* extract out the x-s in 1110xxxx */ #define UTF8_LEAD_4BYTEMASK 0x07 /* extract out the x-s in 11110xxx */ #define UTF8_NONLEAD_BYTEMASK 0x3F /* extract out the x-s in 10xxxxxx */ #define UTF8_LEAD_2BYTE_PREFIX 0xC0 /* 110xxxxx where x is replaced with 0 */ #define UTF8_LEAD_3BYTE_PREFIX 0xE0 /* 1110xxxx where x is replaced with 0 */ #define UTF8_LEAD_4BYTE_PREFIX 0xF0 /* 11110xxx where x is replaced with 0 */ #define UTF8_NONLEAD_BYTE_PREFIX 0x80 /* 10xxxxxx where x is replaced with 0 */ #define UTF8_LEAD_2BYTE_BITLEN 5 /* number of bits extracted from the leading byte of a 2-byte UTF-8 encoding */ #define UTF8_LEAD_3BYTE_BITLEN 4 /* number of bits extracted from the leading byte of a 3-byte UTF-8 encoding */ #define UTF8_LEAD_4BYTE_BITLEN 3 /* number of bits extracted from the 
leading byte of a 4-byte UTF-8 encoding */ #define UTF8_NONLEAD_BITLEN 6 /* number of bits extracted from the nonleading byte of a UTF-8 encoding */ #define UTF_LINE_SEPARATOR 0x2028 #define UTF_PARA_SEPARATOR 0x2029 #define UTF8_SURROGATE_BYTELEN 3 #define GTM_MB_LEN_MAX 4 /* maximum bytes we support for multi-byte char */ #define GTM_MB_DISP_LEN_MAX 2 /* maximum number of display columns for a multi-byte char. * all characters we know fit in a display width of 2 columns */ /* This macro checks if a byte is a possible valid non-leading byte in a UTF-8 byte stream */ #define UTF8_IS_VALID_TRAILING(x) (((unsigned char)(x) & 0xc0) == 0x80) /* This macro checks if a byte is a possible valid leading byte in a UTF-8 byte stream */ #define UTF8_IS_VALID_LEADING(x) (-1 != (int)utf8_followlen[(unsigned char)(x)]) /* boolean_t U_IS_SURROGATE_CODE(wint_t c) * Returns TRUE if the code point (c) of a character falls in the surrogate range. * Returns 0 otherwise. */ #define U_IS_SURROGATE_CODE(codepoint) \ (((unsigned)(codepoint) >= U_SURROGATE_BEGIN) \ && ((unsigned)(codepoint) <= U_SURROGATE_END)) /* boolean_t U_IS_SURROGATE_HIGH(wint_t c) * Returns TRUE if the code point (c) of a character is a leading (high) surrogate. * Returns 0 otherwise. */ #define U_IS_SURROGATE_HIGH(codepoint) \ (((unsigned)(codepoint) >= U_SURROGATE_BEGIN) \ && ((unsigned)(codepoint) < U_LOW_SURROGATE_BEGIN)) /* boolean_t U_IS_SURROGATE_LOW(wint_t c) * Returns TRUE if the code point (c) of a character is a trailing (low) surrogate. * Returns 0 otherwise. */ #define U_IS_SURROGATE_LOW(codepoint) \ (((unsigned)(codepoint) >= U_LOW_SURROGATE_BEGIN) \ && ((unsigned)(codepoint) <= U_SURROGATE_END)) /* If mbptr points to a valid 2-byte UTF-8 encoding this macro returns TRUE. * If not a valid 2-byte UTF-8 encoding, this macro returns FALSE, resets bytelen to 1. * Assumes "mbptr" is of type "uchar_ptr_t". */ #define UTF8_VALID_2BYTE(mbptr, bytelen) \ ((UTF8_IS_VALID_TRAILING(mbptr[1])) \ ? 
TRUE \ : (bytelen = 1, FALSE)) /* boolean_t UTF8_NONCHAR_CODE_3BYTE(wint_t c) * boolean_t UTF8_NONCHAR_CODE_4BYTE(wint_t c) * Each of these macros returns TRUE if the code point (c) of a 3-byte or 4-byte wide-character is noncharacter, or * FALSE otherwise. Noncharacters are the code points that do not have valid character * assignment. This set includes: * U+FDD0 - U+FDEF (32 code points, all of which are 3-byte encoded) * All U+nFFFE and U+nFFFF, for each n from 0x0 to 0x10 (total of 34 code points, * of which U+FFFE and U+FFFF are 3-byte encoded and rest are 4-byte encoded) */ #define UTF8_NONCHAR_CODE_3BYTE(codepoint) \ ((unsigned)(codepoint) >= 0xFDD0 \ && ((unsigned)(codepoint) <= 0xFDEF || ((unsigned)(codepoint) & 0xFFFE) == 0xFFFE)) #define UTF8_NONCHAR_CODE_4BYTE(codepoint) \ (((unsigned)(codepoint) & 0xFFFE) == 0xFFFE) /* boolean_t UTF8_NONCHAR_CODEPOINT(wint_t c) * Returns TRUE if the codepoint (c) of ANY multi-byte character is a noncharacter or FALSE otherwise. * It assumes that UTF8_NONCHAR_CODE_3BYTE macro returns the correct values for any codepoint < UTF8_3BYTE_MAX * (including 1-byte and 2-byte codepoints). */ #define UTF8_NONCHAR_CODEPOINT(codepoint) \ (((unsigned)(codepoint) <= UTF8_3BYTE_MAX) && (UTF8_NONCHAR_CODE_3BYTE(codepoint)) \ || UTF8_NONCHAR_CODE_4BYTE(codepoint)) /* boolean_t UTF8_NONCHAR_3BYTE(char* mbptr) * boolean_t UTF8_NONCHAR_4BYTE(char* mbptr) * Each of these macros returns TRUE if mbptr points to a noncharacter as described above * (or FALSE otherwise), except that the checks are performed on the UTF-8 byte stream * instead of the code points. Below are the equivalent byte patterns: * U+FDD0 - U+FDEF: * 0xEF 0xB7 0x90 - 0xEF 0xB7 0xAF (32 code points) * All U+nFFFE and U+nFFFF with the last two bytes having the byte patterns: * 0xEF 0xBF 0xBE and 0xEF 0xBF 0xBF (U+FFFE, U+FFFF) * 0xF0 0x9F 0xBF 0xBE and 0xF0 0x9F 0xBF 0xBF (U+1FFFE, U+1FFFF) * 0xF0 0xAF 0xBF 0xBE and 0xF0 0xAF 0xBF 0xBF (U+2FFFE, U+2FFFF) * .... * .... 
* 0xF4 0x8F 0xBF 0xBE and 0xF4 0x8F 0xBF 0xBF (U+10FFFE, U+10FFFF) */ #define UTF8_NONCHAR_3BYTE(mbptr) \ (mbptr[0] == 0xEF && ((mbptr[1] == 0xB7 && ((unsigned)(mbptr[2] - 0x90) < 32)) \ || ((mbptr[1] == 0xBF) && (mbptr[2] & 0xBE) == 0xBE))) #define UTF8_NONCHAR_4BYTE(mbptr) \ (((mbptr[1] & 0x0F) == 0x0F) && (mbptr[2] == 0xBF) && (mbptr[3] & 0xBE) == 0xBE) /* If mbptr points to a valid 3-byte UTF-8 encoding this macro returns TRUE. * If not a valid 3-byte UTF-8 encoding, this macro returns FALSE, resets bytelen to 1. * Assumes "mbptr" is of type "uchar_ptr_t". */ #define UTF8_VALID_3BYTE(mbptr, bytelen) \ ((UTF8_IS_VALID_TRAILING(mbptr[1]) && UTF8_IS_VALID_TRAILING(mbptr[2]) \ && ((mbptr[0] != 0xE0) || mbptr[1] >= 0xA0) /* ensure bytestream is above 3-byte UTF-8 BEGIN */ \ && ((mbptr[0] != 0xED) || mbptr[1] <= 0x9F) /* ensure bytestream is NOT a 3-byte UTF-8 surrogate */ \ && !UTF8_NONCHAR_3BYTE(mbptr)) /* ensure bytestream is NOT a 3-byte noncharacter */ \ ? TRUE \ : (bytelen = 1, FALSE)) /* If mbptr points to a valid 4-byte UTF-8 encoding this macro returns TRUE. * If not a valid 4-byte UTF-8 encoding, this macro returns FALSE, resets bytelen to 1. * Assumes "mbptr" is of type "uchar_ptr_t". */ #define UTF8_VALID_4BYTE(mbptr, bytelen) \ ((UTF8_IS_VALID_TRAILING(mbptr[1]) && UTF8_IS_VALID_TRAILING(mbptr[2]) && UTF8_IS_VALID_TRAILING(mbptr[3]) \ && ((mbptr[0] != 0xF0) || mbptr[1] >= 0x90) /* ensure bytestream is above 4-byte UTF-8 BEGIN */ \ && ((mbptr[0] != 0xF4) || mbptr[1] <= 0x8F) /* ensure bytestream is below 4-byte UTF-8 END */ \ && !UTF8_NONCHAR_4BYTE(mbptr)) /* ensure bytestream is NOT a 4-byte noncharacter */ \ ? TRUE \ : (bytelen = 1, FALSE)) /* If mbptr points to a valid 2-byte UTF-8 encoding this macro returns (mbptr + 2) and sets codepoint appropriately. * If not a valid 2-byte UTF-8 encoding, this macro returns (mbptr + 1), and sets codepoint to WEOF. * Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]". 
*/ #define UTF8_MBTOWC_2BYTE(mbptr, codepoint) \ ((UTF8_IS_VALID_TRAILING(mbptr[1])) \ ? ((codepoint = ((codepoint & UTF8_LEAD_2BYTEMASK) << UTF8_NONLEAD_BITLEN) \ | (mbptr[1] & UTF8_NONLEAD_BYTEMASK)) \ , (mbptr + 2)) \ : (codepoint = (wint_t)WEOF, (mbptr + 1))) /* If mbptr points to a valid 3-byte UTF-8 encoding this macro returns (mbptr + 3) and sets codepoint appropriately. * If not a valid 3-byte UTF-8 encoding, this macro returns (mbptr + 1), and sets codepoint to WEOF. * Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]". */ #define UTF8_MBTOWC_3BYTE(mbptr, codepoint) \ ((UTF8_IS_VALID_TRAILING(mbptr[1]) && UTF8_IS_VALID_TRAILING(mbptr[2]) \ && ((mbptr[0] != 0xE0) || mbptr[1] >= 0xA0) /* ensure bytestream is above 3-byte UTF-8 BEGIN */ \ && ((mbptr[0] != 0xED) || mbptr[1] <= 0x9F) /* ensure bytestream is NOT a 3-byte UTF-8 surrogate */ \ && !UTF8_NONCHAR_3BYTE(mbptr)) /* ensure bytestream is NOT a 3-byte noncharacter */ \ ? ((codepoint = ((((codepoint & UTF8_LEAD_3BYTEMASK) \ << UTF8_NONLEAD_BITLEN) | (mbptr[1] & UTF8_NONLEAD_BYTEMASK)) \ << UTF8_NONLEAD_BITLEN) | (mbptr[2] & UTF8_NONLEAD_BYTEMASK)) \ , (mbptr + 3)) \ : (codepoint = (wint_t)WEOF, (mbptr + 1))) /* If mbptr points to a valid 4-byte UTF-8 encoding this macro returns (mbptr + 4) and sets codepoint appropriately. * If not a valid 4-byte UTF-8 encoding, this macro returns (mbptr + 1), and sets codepoint to WEOF. * Assumes "mbptr" is of type "uchar_ptr_t" and that "codepoint" is already set to "mbptr[0]". */ #define UTF8_MBTOWC_4BYTE(mbptr, codepoint) \ ((UTF8_IS_VALID_TRAILING(mbptr[1]) && UTF8_IS_VALID_TRAILING(mbptr[2]) && UTF8_IS_VALID_TRAILING(mbptr[3]) \ && ((mbptr[0] != 0xF0) || mbptr[1] >= 0x90) /* ensure bytestream is above 4-byte UTF-8 BEGIN */ \ && ((mbptr[0] != 0xF4) || mbptr[1] <= 0x8F) /* ensure bytestream is below 4-byte UTF-8 END */ \ && !UTF8_NONCHAR_4BYTE(mbptr)) /* ensure bytestream is NOT a 4-byte noncharacter */ \ ? 
((codepoint = ((((((codepoint & UTF8_LEAD_4BYTEMASK) \ << UTF8_NONLEAD_BITLEN) | (mbptr[1] & UTF8_NONLEAD_BYTEMASK)) \ << UTF8_NONLEAD_BITLEN) | (mbptr[2] & UTF8_NONLEAD_BYTEMASK)) \ << UTF8_NONLEAD_BITLEN) | (mbptr[3] & UTF8_NONLEAD_BYTEMASK)) \ , (mbptr + 4)) \ : (codepoint = (wint_t)WEOF, (mbptr + 1))) /* If mbptr points to a valid 2-byte UTF-8 encoding this macro returns (mbptr + 2). * If not a valid 2-byte UTF-8 encoding, this macro returns (mbptr + 1). * Assumes "mbptr" is of type "uchar_ptr_t". */ #define UTF8_MBNEXT_2BYTE(mbptr) \ (UTF8_IS_VALID_TRAILING(mbptr[1]) \ ? (mbptr + 2) \ : (mbptr + 1)) /* If mbptr points to a valid 3-byte UTF-8 encoding this macro returns (mbptr + 3). * If not a valid 3-byte UTF-8 encoding, this macro returns (mbptr + 1). * Assumes "mbptr" is of type "uchar_ptr_t". */ #define UTF8_MBNEXT_3BYTE(mbptr) \ ((UTF8_IS_VALID_TRAILING(mbptr[1]) && UTF8_IS_VALID_TRAILING(mbptr[2]) \ && ((mbptr[0] != 0xE0) || mbptr[1] >= 0xA0) /* ensure bytestream is above 3-byte UTF-8 BEGIN */ \ && ((mbptr[0] != 0xED) || mbptr[1] <= 0x9F) /* ensure bytestream is NOT a 3-byte UTF-8 surrogate */ \ && !UTF8_NONCHAR_3BYTE(mbptr)) /* ensure bytestream is NOT a 3-byte noncharacter */ \ ? (mbptr + 3) \ : (mbptr + 1)) /* If mbptr points to a valid 4-byte UTF-8 encoding this macro returns (mbptr + 4). * If not a valid 4-byte UTF-8 encoding, this macro returns (mbptr + 1). * Assumes "mbptr" is of type "uchar_ptr_t". */ #define UTF8_MBNEXT_4BYTE(mbptr) \ ((UTF8_IS_VALID_TRAILING(mbptr[1]) && UTF8_IS_VALID_TRAILING(mbptr[2]) && UTF8_IS_VALID_TRAILING(mbptr[3]) \ && ((mbptr[0] != 0xF0) || mbptr[1] >= 0x90) /* ensure bytestream is above 4-byte UTF-8 BEGIN */ \ && ((mbptr[0] != 0xF4) || mbptr[1] <= 0x8F) /* ensure bytestream is below 4-byte UTF-8 END */ \ && !UTF8_NONCHAR_4BYTE(mbptr)) /* ensure bytestream is NOT a 4-byte noncharacter */ \ ? 
(mbptr + 4) \ : (mbptr + 1)) LITREF unsigned int utf8_bytelen[]; /* boolean_t UTF8_VALID(char *ptr, char *ptrend, unsigned int bytelen) * Inspects bytes of the multi-byte UTF-8 string "ptr" upto "ptrend" and Returns TRUE if the * byte sequence beginning at s forms a welformed and complete UTF-8 character, or FALSE * otherwise. Sets "bytelen" to the byte length of the UTF-8 character if returning TRUE and * to 1 if returning FALSE. A well-formed UTF-8 codepoint that is either a surrogate (in the * range D800 - DFFF) or a noncharacter is considered invalid. This macro assumes that * "ptrend" is at least "ptr+1" and does not do any checks on this. */ #define UTF8_VALID(mbptr, ptrend, bytelen) \ ((bytelen) = utf8_bytelen[((uchar_ptr_t)(mbptr))[0]], \ (((((uchar_ptr_t)(mbptr))[0]) <= ASCII_MAX) ? TRUE /* ASCII. Do simplest check first. */ \ : (((bytelen) == 1) ? FALSE /* Invalid leading byte */ \ : (((int4)(bytelen) > (int4)(((uchar_ptr_t)(ptrend)) - ((uchar_ptr_t)(mbptr)))) \ ? (bytelen = 1, FALSE) /* Not enough length in input string */ \ : ((bytelen) == 2 ? UTF8_VALID_2BYTE(((uchar_ptr_t)(mbptr)), (bytelen)) \ : ((bytelen) == 3 ? UTF8_VALID_3BYTE(((uchar_ptr_t)(mbptr)), (bytelen)) \ : /* bytelen == 4 */UTF8_VALID_4BYTE(((uchar_ptr_t)(mbptr)), (bytelen)) \ )))))) /* boolean_t U_VALID_CODE(wint_t codepoint) * Returns * TRUE if the code point of a character is a valid Unicode code point * FALSE otherwise. * Invalid code points include: * All surrogate code points * All noncharacter code points * All code points greater than U+10FFFF */ #define U_VALID_CODE(codepoint) \ (((unsigned)(codepoint) <= UTF8_4BYTE_MAX) \ && !U_IS_SURROGATE_CODE(codepoint) \ && !UTF8_NONCHAR_CODEPOINT(codepoint)) LITREF signed int utf8_followlen[]; /* int UTF8_MBFOLLOW(char *s) * Inspects only the first byte of a multi-byte (or even an incomplete) UTF-8 string * pointed at s, and returns the numbers of bytes to follow in order to form a complete * character. 
The possible return values by this macro are 0, 1, 2 or 3. * If the byte stored at s does not form a legal first-byte of UTF-8 character, * it returns -1. */ #define UTF8_MBFOLLOW(mbptr) (utf8_followlen[((uchar_ptr_t)(mbptr))[0]]) /* int UTF16BE_MBFOLLOW(char *mbptr, char *ptrend) * Inspects up to two bytes of a multi-byte (or even an incomplete) UTF-16BE string * pointed at mbptr, and returns the numbers of bytes to follow the byte at mbptr in order * to form a complete UTF-16 character in BIG-ENDIAN format. The valid return values by * this macro are 1 and 3. If the number of bytes between [mbptr, ptrend) is less than 2, * the macro returns -1. */ #define UTF16BE_MBFOLLOW(mbptr, ptrend) \ ((ptrend - mbptr >= 2) ? (UTF16BE_HIGH_SURROGATE(mbptr) ? 3 : 1) : -1) /* int UTF16LE_MBFOLLOW(char *mbptr, char *ptrend) * Inspects up to two bytes of a multi-byte (or even an incomplete) UTF-16LE string * pointed at mbptr, and returns the numbers of bytes to follow the byte at mbptr in order * to form a complete UTF-16 character in LITTLE-ENDIAN format. The valid return values by * this macro are 1 and 3. If the number of bytes between [mbptr, ptrend) is less than 2, * the macro returns -1. */ #define UTF16LE_MBFOLLOW(mbptr, ptrend) \ ((ptrend - mbptr >= 2) ? (UTF16LE_HIGH_SURROGATE(mbptr) ? 3 : 1) : -1) /* boolean_t UTF16BE_VALID(char *ptr, char *ptrend, unsigned int bytelen) * Inspects 2 or 4 bytes of the UTF-16BE string "ptr" upto "ptrend" and Returns TRUE * if the byte sequence beginning at ptr forms a welformed and complete UTF-16 character * in big-endian format, or FALSE otherwise. This macro also sets "bytelen" to 2 (for * BMP characters) or 4 (for surrogate pair). * NOTES: * "bytelen" is always set irrespective of the validity of the code point (eg. * it can be set to 4 for surrogate pair for which the macro returns FALSE * because its code point is not valid (non-character). 
* * "ptrend" is asummed to be at least "ptr+2" */ #define UTF16BE_VALID(mbptr, ptrend, bytelen) \ (UTF16BE_HIGH_SURROGATE(mbptr) /* compute the code point first */ \ ? (((ptrend - mbptr) >= 4 && UTF16BE_LOW_SURROGATE(mbptr + 2)) \ ? ((UTF16BE_LOAD_SURROGATE(mbptr, bytelen), U_VALID_CODE(bytelen)) \ ? (bytelen = 4, TRUE) : (bytelen = 4, FALSE)) \ : (bytelen = 2, FALSE)) \ : (((bytelen = UTF16BE_GET_UNIT(mbptr)), U_VALID_CODE(bytelen)) \ ? (bytelen = 2, TRUE) : (bytelen = 2, FALSE))) /* boolean_t UTF16LE_VALID(char *ptr, char *ptrend, unsigned int bytelen) * Inspects 2 or 4 bytes of the UTF-16BE string "ptr" upto "ptrend" and Returns TRUE * if the byte sequence beginning at ptr forms a welformed and complete UTF-16 character * in little-endian format, or FALSE otherwise. This macro also sets "bytelen" to 2 (for * BMP characters) or 4 (for surrogate pair). * NOTES: * "bytelen" is always set irrespective of the validity of the code point (eg. * it can be set to 4 for surrogate pair for which the macro returns FALSE * because its code point is not valid (non-character). * * "ptrend" is asummed to be at least "ptr+2" */ #define UTF16LE_VALID(mbptr, ptrend, bytelen) \ (UTF16LE_HIGH_SURROGATE(mbptr) /* compute the code point first */ \ ? (((ptrend - mbptr) >= 4 && UTF16LE_LOW_SURROGATE(mbptr + 2)) \ ? ((UTF16LE_LOAD_SURROGATE(mbptr, bytelen), U_VALID_CODE(bytelen)) \ ? (bytelen = 4, TRUE) : (bytelen = 4, FALSE)) \ : (bytelen = 2, FALSE)) \ : (((bytelen = UTF16LE_GET_UNIT(mbptr)), U_VALID_CODE(bytelen)) \ ? (bytelen = 2, TRUE) : (bytelen = 2, FALSE))) /* unsigned char *UTF8_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint) * Inspects bytes of the UTF-8 string upto ptrend and sets "codepoint" to the code point of * the next character in the string. If the bytes starting from mbptr do not form a complete * wellformed UTF-8 character, it sets "codepoint" to WEOF. Returns (mbptr+len) where "len" * is the byte length of the UTF-8 character found. 
If "codepoint" is set to WEOF, the return * value is (mbptr+1). */ #define UTF8_MBTOWC(mbptr, ptrend, codepoint) \ ((codepoint) = (wint_t)(((uchar_ptr_t)(mbptr))[0]), \ (((codepoint) <= ASCII_MAX) ? ((uchar_ptr_t)mbptr + 1) /* ASCII. Do simplest check first. */ \ : ((utf8_bytelen[(codepoint)] == 1) /* Invalid leading byte */ \ ? ((codepoint) = (wint_t)WEOF, ((uchar_ptr_t)mbptr + 1)) \ : (((int4)utf8_bytelen[(codepoint)] /* Not enough length in input string */ \ > (int4)(((uchar_ptr_t)(ptrend)) - ((uchar_ptr_t)(mbptr)))) \ ? ((codepoint) = (wint_t)WEOF, ((uchar_ptr_t)mbptr + 1)) \ : (utf8_bytelen[(codepoint)] == 2 ? UTF8_MBTOWC_2BYTE(((uchar_ptr_t)(mbptr)), (codepoint)) \ : (utf8_bytelen[(codepoint)] == 3 ? UTF8_MBTOWC_3BYTE(((uchar_ptr_t)(mbptr)), (codepoint)) \ : /* utf8_bytelen[codepoint] == 4 */UTF8_MBTOWC_4BYTE(((uchar_ptr_t)(mbptr)), (codepoint)) \ )))))) /* unsigned char* UTF8_MBNEXT(char *ptr, char *ptrend) * Assuming that the string pointed at ptr is wellformed, it inspects bytes upto ptrend * and advances the pointer by the number of bytes used by the character pointed at ptr. * It returns the pointer to the beginning of the following character. If the bytes * starting from s do not form a welformed character within the limits defined * by ptrend, it returns the pointer to the next byte (i.e. s+1). */ #define UTF8_MBNEXT(mbptr, ptrend) \ (((((uchar_ptr_t)(mbptr))[0]) <= ASCII_MAX) ? ((uchar_ptr_t)mbptr + 1) /* ASCII. Do simplest check first. */ \ : ((utf8_bytelen[(((uchar_ptr_t)(mbptr))[0])] == 1) /* Invalid leading byte */ \ ? ((uchar_ptr_t)mbptr + 1) \ : (((int4)utf8_bytelen[(((uchar_ptr_t)(mbptr))[0])] > (int4)((uchar_ptr_t)ptrend - (uchar_ptr_t)mbptr)) \ ? ((uchar_ptr_t)mbptr + 1) /* Not enough length in input string */ \ : (utf8_bytelen[(((uchar_ptr_t)(mbptr))[0])] == 2 ? UTF8_MBNEXT_2BYTE(((uchar_ptr_t)(mbptr))) \ : (utf8_bytelen[(((uchar_ptr_t)(mbptr))[0])] == 3 ? 
UTF8_MBNEXT_3BYTE(((uchar_ptr_t)(mbptr))) \ : /* utf8_bytelen[(((uchar_ptr_t)mbptr)[0])] == 4 */UTF8_MBNEXT_4BYTE(((uchar_ptr_t)(mbptr))) \ ))))) /* unsigned char* UTF8_WCTOMB(wint_t c, char *s) * Converts the code point of a character (c) to a sequence of bytes and stores * the result (of 1 to 4 bytes long) at the beginning of the character array pointed * to by s. It returns the pointer advanced by the number of bytes required for c. * For invalid code points no conversion is done and and the macro returns s. */ #define UTF8_WCTOMB(codepoint, mbptr) \ (((unsigned)(codepoint) <= UTF8_1BYTE_MAX) /* 1-byte UTF-8 encoding */ \ ? (*((uchar_ptr_t)mbptr) = (unsigned char)(codepoint), ((uchar_ptr_t)mbptr) + 1) \ : (((unsigned)(codepoint) <= UTF8_2BYTE_MAX) /* 2-byte UTF-8 encoding */ \ ? (*(((uchar_ptr_t)mbptr) + 1) \ = (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK) | UTF8_NONLEAD_BYTE_PREFIX), \ *((uchar_ptr_t)mbptr) = (unsigned char)(((codepoint) >> UTF8_NONLEAD_BITLEN) | UTF8_LEAD_2BYTE_PREFIX), \ ((uchar_ptr_t)mbptr) + 2) \ : (((unsigned)(codepoint) <= UTF8_3BYTE_MAX) /* 3-byte UTF-8 encoding */ \ ? ((U_IS_SURROGATE_CODE(codepoint) || UTF8_NONCHAR_CODE_3BYTE(codepoint)) \ ? ((uchar_ptr_t)mbptr) /* Surrogate or noncharacter (3-byte case) */ \ : (*(((uchar_ptr_t)mbptr) + 2) /* Non-surrogate 3-byte case */ \ = (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK) | UTF8_NONLEAD_BYTE_PREFIX), \ *(((uchar_ptr_t)mbptr) + 1) \ = (unsigned char)((((codepoint) >> UTF8_NONLEAD_BITLEN) & UTF8_NONLEAD_BYTEMASK) \ | UTF8_NONLEAD_BYTE_PREFIX), \ *((uchar_ptr_t)mbptr) = (unsigned char)(((codepoint) >> (2 * UTF8_NONLEAD_BITLEN)) \ | UTF8_LEAD_3BYTE_PREFIX), \ ((uchar_ptr_t)mbptr) + 3)) \ : ((((unsigned)(codepoint) <= UTF8_4BYTE_MAX) && !UTF8_NONCHAR_CODE_4BYTE(codepoint)) /* 4-byte UTF-8 encoding */ \ ? 
(*(((uchar_ptr_t)mbptr) + 3) \ = (unsigned char)(((codepoint) & UTF8_NONLEAD_BYTEMASK) | UTF8_NONLEAD_BYTE_PREFIX), \ *(((uchar_ptr_t)mbptr) + 2) \ = (unsigned char)((((codepoint) >> UTF8_NONLEAD_BITLEN) & UTF8_NONLEAD_BYTEMASK) \ | UTF8_NONLEAD_BYTE_PREFIX), \ *(((uchar_ptr_t)mbptr) + 1) \ = (unsigned char)((((codepoint) >> (2 * UTF8_NONLEAD_BITLEN)) & UTF8_NONLEAD_BYTEMASK) \ | UTF8_NONLEAD_BYTE_PREFIX), \ *((uchar_ptr_t)mbptr) = (unsigned char)(((codepoint) >> (3 * UTF8_NONLEAD_BITLEN)) \ | UTF8_LEAD_4BYTE_PREFIX), \ ((uchar_ptr_t)mbptr) + 4) \ : ((uchar_ptr_t)mbptr))))) /* boolean_t UTF8_SURROGATE(char* s, char *ptrend) * Inspects bytes of the multi-byte UTF-8 string upto ptrend and Returns TRUE if the * byte sequence beginning at s forms a welformed UTF-8 character and an * isolated surrogate character (either lower surrogate or upper surrogate). * It returns FALSE, otherwise. */ #define UTF8_SURROGATE(mbptr, ptrend) \ (((UTF8_SURROGATE_BYTELEN /* maxlen should be at least 3-bytes */ \ <= ((int4)((uchar_ptr_t)ptrend - (uchar_ptr_t)mbptr))) \ && (((uchar_ptr_t)mbptr)[0] == 0xED) /* leading byte should be 0xED for surrogate UTF-8 */ \ && (((uchar_ptr_t)mbptr)[1] >= 0xA0) /* first non-leading byte should be at least 0xA0 */ \ && (((uchar_ptr_t)mbptr)[1] <= 0xBF) /* first non-leading byte should be at most 0xBF */ \ && (UTF8_IS_VALID_TRAILING(((uchar_ptr_t)mbptr)[2]))) /* second non-leading byte should be valid */ \ ? TRUE : FALSE) /* void UTF8_LEADING_BYTE(char* mbptr, char* baseptr, char* leadptr) * Sets leadptr to point to the leading byte of the UTF-8 character containing the byte * pointed by mbptr. If the byte pointed by mbptr is not part of a valid UTF-8 character, * this macro sets leadptr to mbptr. * NOTE: mbptr and leadptr must not be the same variable. 
*/ #define UTF8_LEADING_BYTE(mbptr, baseptr, leadptr) \ { \ leadptr = mbptr; \ while (leadptr >= baseptr && UTF8_IS_VALID_TRAILING(*(uchar_ptr_t)leadptr)) \ --leadptr; \ if (leadptr < baseptr || !UTF8_IS_VALID_LEADING(*(uchar_ptr_t)leadptr) || \ (mbptr - leadptr) > utf8_followlen[*(uchar_ptr_t)leadptr]) \ leadptr = mbptr; \ } /* Macros to return the UTF-16 (16-bit) code units from a given code point in the supplementary plane. * Note: these macros must be called only for the supplementary code points (> U_BMP_MAX) that are <= UTF8_4BYTE_MAX */ #define UTF16_HIGH_SURROGATE(codepoint) \ (U_HIGH_SURROGATE_BEGIN | ((((codepoint) >> 16) - 1) << 6) | (((codepoint) >> 10) & 0x3F)) #define UTF16_LOW_SURROGATE(codepoint) \ (U_LOW_SURROGATE_BEGIN | ((codepoint) & 0x3FF)) /* Composes a surrogate pair and returns the code point in the supplementary plane */ #define UTF16_COMPOSE_SURROGATES(high, low) \ ((((((high) >> 6) & 0xF) + 1) << 16) | (((high) & 0x3F) << 10) | ((low) & 0x3FF)) /* Macros to convert a UTF-16 (16-bit) code unit into a 2-byte sequence in the appropriate endianness. * The codeunits passed must be less than or equal to U_BMP_MAX */ #define UTF16BE_STORE_UNIT(mbptr, codeunit) \ ((((uchar_ptr_t)mbptr)[0] = ((codeunit) >> 8)), (((uchar_ptr_t)mbptr)[1] = ((codeunit) & 0x00FF))) #define UTF16LE_STORE_UNIT(mbptr, codeunit) \ ((((uchar_ptr_t)mbptr)[1] = ((codeunit) >> 8)), (((uchar_ptr_t)mbptr)[0] = ((codeunit) & 0x00FF))) /* macros to return a single UTF-16 (16-bit) codeunit given a 2-byte sequence */ #define UTF16BE_GET_UNIT(mbptr) \ ((((uchar_ptr_t)mbptr)[0] << 8) | ((uchar_ptr_t)mbptr)[1]) #define UTF16LE_GET_UNIT(mbptr) \ ((((uchar_ptr_t)mbptr)[1] << 8) | ((uchar_ptr_t)mbptr)[0]) /* macros to load UTF-16 surrogate codeunit pairs and return the code point in the supplementary plane. 
* Note: mbptr must point to a valid 4-byte sequence of high and low surrogates */ #define UTF16BE_LOAD_SURROGATE(mbptr, codepoint) \ (codepoint = UTF16BE_GET_UNIT(mbptr), \ codepoint = UTF16_COMPOSE_SURROGATES(codepoint, UTF16BE_GET_UNIT(mbptr+2))) #define UTF16LE_LOAD_SURROGATE(mbptr, codepoint) \ (codepoint = UTF16LE_GET_UNIT(mbptr), \ codepoint = UTF16_COMPOSE_SURROGATES(codepoint, UTF16LE_GET_UNIT(mbptr+2))) /* char* UTF16BE_WCTOMB(wint_t codepoint, char *mbptr) * Converts the code point of a character (codepoint) in to big-endian UTF-16 bytes * and stores the result (of 2 or 4 bytes long) at the beginning of the character * array pointed to by mbptr. It returns the pointer advanced by the number of bytes * required for codepoint. For invalid code points, no conversion is done and and * the macro returns mbptr. */ #define UTF16BE_WCTOMB(codepoint, mbptr) \ (U_VALID_CODE(codepoint) \ ? ((codepoint) <= U_BMP_MAX \ ? (UTF16BE_STORE_UNIT(mbptr, (codepoint)), mbptr + 2) /* code points in BMP */ \ : (UTF16BE_STORE_UNIT(mbptr, UTF16_HIGH_SURROGATE(codepoint)), /* supplementary plane */\ UTF16BE_STORE_UNIT(mbptr + 2, UTF16_LOW_SURROGATE(codepoint)), mbptr + 4)) \ : mbptr) /* char* UTF16LE_WCTOMB(wint_t codepoint, char *mbptr) * Converts the code point of a character (codepoint) in to little-endian UTF-16 bytes * and stores the result (of 2 or 4 bytes long) at the beginning of the character * array pointed to by mbptr. It returns the pointer advanced by the number of bytes * required for codepoint. For invalid code points, no conversion is done and and * the macro returns mbptr. */ #define UTF16LE_WCTOMB(codepoint, mbptr) \ (U_VALID_CODE(codepoint) \ ? ((codepoint) <= U_BMP_MAX /* 16-bit characters */ \ ? 
(UTF16LE_STORE_UNIT(mbptr, (codepoint)), mbptr + 2) /* code points in BMP */ \ : (UTF16LE_STORE_UNIT(mbptr, UTF16_HIGH_SURROGATE(codepoint)), /* supplementary plane */\ UTF16LE_STORE_UNIT(mbptr + 2, UTF16_LOW_SURROGATE(codepoint)), mbptr + 4)) \ : mbptr) /* char *UTF16BE_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint) * Inspects 2 bytes (or 4 bytes if surrogates) of the UTF-16 string in big-endian and * sets "codepoint" to the code point of the next character in the string. Returns * (mbptr + len) where "len" is the byte length of the UTF-16BE character found. If * the bytes starting from mbptr do not form a complete welformed UTF-16BE character, * it sets codepoint to WEOF and return mbptr. */ #define UTF16BE_MBTOWC(mbptr, ptrend, codepoint) \ ((UTF16BE_HIGH_SURROGATE(mbptr) /* compute the code point first */ \ ? (((ptrend - mbptr) >= 4 && UTF16BE_LOW_SURROGATE(mbptr + 2)) \ ? UTF16BE_LOAD_SURROGATE(mbptr, codepoint) : (codepoint = WEOF)) \ : (codepoint = UTF16BE_GET_UNIT(mbptr))), \ (U_VALID_CODE(codepoint) /* validate the code point */ \ ? ((codepoint) <= U_BMP_MAX ? (mbptr + 2) : (mbptr + 4)) \ : (((codepoint) = WEOF), mbptr))) /* char *UTF16LE_MBTOWC(char *mbptr, char *ptrend, wint_t codepoint) * Inspects 2 bytes (or 4 bytes if surrogates) of the UTF-16 string in little-endian and * sets "codepoint" to the code point of the next character in the string. Returns * (mbptr + len) where "len" is the byte length of the UTF-16BE character found. If * the bytes starting from mbptr do not form a complete welformed UTF-16LE character, * it sets codepoint to WEOF and return mbptr. */ #define UTF16LE_MBTOWC(mbptr, ptrend, codepoint) \ ((UTF16LE_HIGH_SURROGATE(mbptr) /* compute the code point first */ \ ? (((ptrend - mbptr) >= 4 && UTF16LE_LOW_SURROGATE(mbptr + 2)) \ ? UTF16LE_LOAD_SURROGATE(mbptr, codepoint) : (codepoint = WEOF)) \ : (codepoint = UTF16LE_GET_UNIT(mbptr))), \ (U_VALID_CODE(codepoint) /* validate the code point */ \ ? ((codepoint) <= U_BMP_MAX ? 
(mbptr + 2) : (mbptr + 4)) \ : (((codepoint) = WEOF), mbptr))) /* boolean_t UTF16BE_HIGH_SURROGATE(char* mbptr) * Inspects at most 2 bytes in the UTF-16 string and Returns TRUE if the byte sequence * beginning at mbptr forms a welformed UTF-16BE high surrogate character (U+D800 - U+DBFF) * and FALSE otherwise. */ #define UTF16BE_HIGH_SURROGATE(mbptr) \ (U_IS_SURROGATE_HIGH(UTF16BE_GET_UNIT(mbptr))) /* boolean_t UTF16LE_HIGH_SURROGATE(char* mbptr) * Inspects at most 2 bytes in the UTF-16 string and Returns TRUE if the byte sequence * beginning at mbptr forms a welformed UTF-16LE high surrogate character (U+D800 - U+DBFF) * and FALSE otherwise. */ #define UTF16LE_HIGH_SURROGATE(mbptr) \ (U_IS_SURROGATE_HIGH(UTF16LE_GET_UNIT(mbptr))) /* boolean_t UTF16BE_LOW_SURROGATE(char* mbptr) * Inspects at most 2 bytes in the UTF-16 string and Returns TRUE if the byte sequence * beginning at mbptr forms a welformed UTF-16BE low surrogate character (U+DC00 - U+DFFF) * and FALSE otherwise. */ #define UTF16BE_LOW_SURROGATE(mbptr) \ (U_IS_SURROGATE_LOW(UTF16BE_GET_UNIT(mbptr))) /* boolean_t UTF16LE_LOW_SURROGATE(char* mbptr) * Inspects at most 2 bytes in the UTF-16 string and Returns TRUE if the byte sequence * beginning at mbptr forms a welformed UTF-16LE low surrogate character (U+DC00 - U+DFFF) * and FALSE otherwise. 
*/
#define UTF16LE_LOW_SURROGATE(mbptr)	\
	(U_IS_SURROGATE_LOW(UTF16LE_GET_UNIT(mbptr)))

/* The following macros provide the character classification for Unicode characters given their code points.
 * Each one forwards its argument unchanged to the correspondingly named u_*() function, with the exception
 * of U_ISPRINT, which is routed through GTM_U_ISPRINT.
 */
#define U_ISLOWER(c)	u_islower(c)
#define U_ISUPPER(c)	u_isupper(c)
#define U_ISALPHA(c)	u_isalpha(c)
#define U_ISCNTRL(c)	u_iscntrl(c)
#define U_ISDIGIT(c)	u_isdigit(c)
#define U_ISPUNCT(c)	u_ispunct(c)
#define U_ISSPACE(c)	u_isspace(c)
#define U_ISBLANK(c)	u_isblank(c)
#define U_ISGRAPH(c)	u_isgraph(c)
#define U_ISPRINT(c)	GTM_U_ISPRINT(c) /* see macro definition for why redirection needed */
#define U_ISTITLE(c)	u_istitle(c)
#define U_CHARTYPE(c)	u_charType(c)

/* uint4 CTYPEMASK(wint_t c)
 *
 * This macro assumes that "c" is a valid unicode codepoint.
 *
 * Returns a patcode from a code point (wide character wint_t) paralleling the way ICU library functions classify codepoints.
 *	u_isalpha (for A)
 *	u_isdigit (for N)
 *	u_ispunct (for P)
 *	u_iscntrl (for C)
 * But with the following adjustments.
 *	1) If $ZPATNUMERIC is not "UTF-8", non-ASCII decimal digits are classified as A.
 *	2) Non-decimal digits (Nl and No) are classified as A. Note: u_isdigit only matches decimal digits.
 *	3) Anything left is classified via u_isprint into either P or C. Note: u_isprint only matches non-control characters.
 * Note that the ISV $ZPATN[UMERIC] dictates how the pattern class N used in the pattern match operator is interpreted.
 * If $ZPATNUMERIC is "UTF-8", the pattern class N matches any decimal numeric character as defined by the Unicode standard.
 * If $ZPATNUMERIC is "M", GT.M restricts the pattern class N to match only ASCII digits 0-9 (i.e. ASCII 48-57).
 * The variable "utf8_patnumeric" is TRUE if $ZPATNUMERIC is "UTF-8".
* * The above rules result in the following mapping * -------------------------------------------------- * Unicode general category GT.M patcode class * -------------------------------------------------- * L* (all letters) -> A * M* (all marks) -> P * Nd (decimal numbers) -> N (if decimal digit is ASCII or $ZPATNUMERIC is "UTF-8", otherwise -> A) * Nl (letter numbers) -> A (examples of Nl are Roman numerals) * No (other numbers) -> A (examples of No are fractions) * P* (all punctuation) -> P * S* (all symbols) -> P * Zs (spaces) -> P * Zl (line separators) -> C * Zp (paragraph separators) -> C * C* (all control codepoints) -> C * * For a description of the Unicode general categories see http://unicode.org/versions/Unicode4.0.0/ch04.pdf (section 4.5) * * E = A + P + N + C and the classifications A, P, N, and C are mutually exclusive. * * This means that PATM_UTF8_NONBASIC does not currently have any codepoints mapped to it. * It is being retained in case it is needed in the future. */ /* our mask to map non-decimal digits into the PATM_A */ #define GTM_NA_MASK (U_GC_NL_MASK | U_GC_NO_MASK) #define CTYPEMASK(c) \ (U_ISALPHA(c) ? /* alphabet */ \ (U_ISLOWER(c) ? PATM_L /* lower-case */ \ : (U_ISUPPER(c) ? PATM_U /* upper-case */ \ : PATM_UTF8_ALPHABET)) /* unicode alphabet that is neither lower nor upper case */ \ : (U_ISDIGIT(c) /* ascii or non-ascii decimal digit */ \ ? ((utf8_patnumeric || IS_ASCII(c)) /* check $ZPATNUMERIC setting */ \ ? PATM_N /* Ascii digit OR $ZPATNUMERIC set to "UTF-8" */ \ : PATM_UTF8_ALPHABET) /* $ZPATNUMERIC set to "M" and non-ascii decimal digit */ \ : ((U_MASK(U_CHARTYPE(c)) & GTM_NA_MASK)/* put non-decimal digits in */ \ ? PATM_UTF8_ALPHABET /* PATM_UTF8_ALPHABET */ \ : (U_ISPUNCT(c) ? PATM_P /* punctuation */ \ :(U_ISCNTRL(c) ? PATM_C /* control */ \ /* unicode character that is not part of any basic class */ \ :(U_ISPRINT(c) /* if printable */ \ ? 
PATM_P /* punctuation */ \ : PATM_C)))))) /* otherwise, control */ /* uint4 TYPEMASK(char *ptr, char *ptrend, char *ptrnext, wint_t codepoint) * Inspects bytes of a character (in UTF-8 format) starting at "ptr" upto "ptrend", and returns its patcode. * This macro is a replacment to the existing typemask[] table that works in both UTF-8 and non-UTF8 mode. * This macro should only be used by the compiler. This macro assumes that "gtm_utf8_mode" is TRUE. * The parameter "codepoint" is set to the codepoint which is the same thing that gets returned by the macro. */ #define TYPEMASK(ptr, ptrend, ptrnext, codepoint) \ (IS_ASCII(*(ptr)) \ ? ((ptrnext = ptr + 1), (codepoint) = *(ptr), typemask[*(ptr)]) \ : (ptrnext = UTF8_MBTOWC(ptr, ptrend, codepoint), CTYPEMASK(codepoint))) /* uint4 PATTERN_TYPEMASK(char *ptr, char *ptrend, char *ptrnext, wint_t codepoint) * Inspects bytes of a character (in UTF-8 format) starting at "ptr" upto "ptrend", and returns its patcode. * This macro is a replacment to the existing pattern_typemask[] table that works in both UTF-8 and non-UTF8 mode. * This macro should only be used by the runtime. This macro assumes that "gtm_utf8_mode" is TRUE. * The parameter "codepoint" is set to the codepoint if it is a multi-byte UTF8 character. */ #define PATTERN_TYPEMASK(ptr, ptrend, ptrnext, codepoint) \ (IS_ASCII(*(ptr)) \ ? ((ptrnext = ptr + 1), (codepoint) = *(ptr), pattern_typemask[*(ptr)]) \ : (ptrnext = UTF8_MBTOWC(ptr, ptrend, codepoint), CTYPEMASK(codepoint))) /* Returns the display column width of a character given its code point. This macro * returns -1 for control characters and 0 for non-spacing (combining) characters */ #define UTF8_WCWIDTH(c) gtm_wcwidth((wint_t)(c)) /* The following macro is same as UTF8_WCWIDTH except that it returns 0 for unprintable valid characters as well. * It is primarily used by the IO code. 
*/ #ifdef UNICODE_SUPPORTED #define GTM_IO_WCWIDTH(CHAR,RET) \ if (utf8_active) \ { \ RET = UTF8_WCWIDTH(CHAR); \ RET = (0 > RET ? 0 : RET); \ } else \ RET = 1 #else #define GTM_IO_WCWIDTH(CHAR,RET) RET = 1 #endif /* Offsets for use with u32_line_term[] */ #define U32_LT_LF 0 #define U32_LT_CR 1 #define U32_LT_NL 2 #define U32_LT_FF 3 #define U32_LT_LS 4 #define U32_LT_PS 5 #define U32_LT_LAST 5 /* not counting null sentinel */ #ifdef UNICODE_SUPPORTED #include "gtm_icu_api.h" int trim_U16_line_term(UChar *buffer, int len); #endif /* There could be integral promotion/sign extension issues if short, int (or an integral type) is used for comparison. Avoid * such issues by definining BOM as a string */ #define UTF16BE_BOM "\xFE\xFF" /* Big Endian BYTE ORDER MARKER */ #define UTF16BE_BOM_LEN STR_LIT_LEN(UTF16BE_BOM) #define UTF16LE_BOM "\xFF\xFE" /* Little Endian BYTE ORDER MARKER */ #define UTF16LE_BOM_LEN STR_LIT_LEN(UTF16LE_BOM) #define UTF8_BOM "\xEF\xBB\xBF" /* No relevance to endian-ness, a UTF8 MARKER similar to UTF16_BOM */ #define UTF8_BOM_LEN STR_LIT_LEN(UTF8_BOM) #define UTF32BE_BOM "\x00\x00\xFE\xFF" /* Big Endian BYTE ORDER MARKER */ #define UTF32BE_BOM_LEN STR_LIT_LEN(UTF32BE_BOM) #define UTF32LE_BOM "\xFF\xFE\x00\x00" /* Little Endian BYTE ORDER MARKER */ #define UTF32LE_BOM_LEN STR_LIT_LEN(UTF32LE_BOM) #define BOM_CODEPOINT 0xFEFF #define UTF8_BADCHAR(len, str, strtop, chset_len, chset) \ utf8_badchar((len), (unsigned char *)(str), (unsigned char *)(strtop), (chset_len), (unsigned char *)(chset)) #define UTF8_BADCHAR_STX(len, str, strtop, chset_len, chset) \ utf8_badchar_stx((len), (unsigned char *)(str), (unsigned char *)(strtop), (chset_len), (unsigned char *)(chset)) #define UTF8_LEN_STRICT(ptr, len) \ utf8_len_strict((unsigned char *)(ptr), (len)) /* This macro is needed to to ensure all Unicode line terminators are considered non-printable. 
As of this * writing, ICU's u_isprint returns TRUE for LS/PS (Line/Paragraph separator; codepoints 0x2028, 0x2029) * and this causes problems in extracting and loading data which contains these codepoints (in UTF8 mode). * Ideally, one should go through all the line terminators in u32_line_term[] array and check them. * But since this routine is performance intensive (called from mupip extract which can take hours for * huge databases) we avoid a loop and just check for LS/PS which we know dont work right with "u_isprint" */ #define GTM_U_ISPRINT(code) \ ((((UChar32)UTF_LINE_SEPARATOR == (UChar32)(code)) || ((UChar32)UTF_PARA_SEPARATOR == (UChar32)(code))) \ ? FALSE \ : u_isprint(code)) GBLREF boolean_t utf8_patnumeric; int utf8_len(mstr* str); int utf8_len_stx(mstr* str); int utf8_len_strict(unsigned char* ptr, int len); int gtm_wcwidth(wint_t code); int gtm_wcswidth(unsigned char* ptr, int len, boolean_t strict, int nonprintwidth); void utf8_badchar(int len, unsigned char* str, unsigned char *strtop, int chset_len, unsigned char* chset); void utf8_badchar_stx(int len, unsigned char* str, unsigned char *strtop, int chset_len, unsigned char* chset); unsigned char *gtm_utf8_trim_invalid_tail(unsigned char *str, int len); /* To prevent GTMSECSHR from pulling in the function "gtmwcswidth" (used in util_output.c) and in turn the entire Unicode * codebase, we define a function-pointer variable and initialize it at startup to NULL only in GTMSECSHR and not-null * in all the other executables. */ typedef int (*gtm_wcswidth_fnptr_t)(unsigned char* ptr, int len, boolean_t strict, int nonprintwidth); GBLREF gtm_wcswidth_fnptr_t gtm_wcswidth_fnptr; /* see comment in gtm_utf8.h about this typedef */ #endif /* GTM_UTF8_H */