/****************************************************************
 *								*
 *	Copyright 2006, 2011 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_string.h"

#include "op.h"
#include "stringpool.h"
#include "gtm_icu_api.h"
#include "gtm_conv.h"
#include "gtm_utf8.h"

GBLREF	boolean_t	gtm_utf8_mode;
GBLREF	spdesc		stringpool;
GBLREF	casemap_t	casemaps[];

error_def(ERR_BADCASECODE);
error_def(ERR_BADCHSET);
error_def(ERR_ICUERROR);
error_def(ERR_MAXSTRLEN);
error_def(ERR_INVFCN);
error_def(ERR_TEXT);

/* Free "ptr" and reset it to NULL unless it points at the on-stack buffer "local".
 * Kept as a single fully parenthesized expression with no trailing semicolon so that it behaves
 * like an ordinary call wherever it is used; "ptr" must be a modifiable lvalue and is evaluated
 * more than once.
 */
#define	RELEASE_IF_NOT_LOCAL(ptr, local)	(((ptr) != (local)) ? (free(ptr), ((ptr) = NULL)) : (ptr))

/**************************************************************************************************
 * Routine to perform string-level case conversion to "upper", "lower" and "title" case.
 * Since ICU only supports API using UTF-16 representation, case conversion of UTF-8 strings involves
 * encoding conversion as described below:
 *	1. First, the UTF-8 source string is converted to UTF-16 representation (u_strFromUTF8())
 *	   which is stored in a local buffer of size MAX_ZCONVBUFF. If this space is not sufficient,
 *	   we try to allocate it in heap.
 *	2. Since case conversion may expand the string, we compute the desired space required by
 *	   preflighting the ICU case conversion API and then allocate the space before performing
 *	   the real conversion.
 *	3. Translating the converted UTF-16 string back to UTF-8 is done in stringpool (with similar
 *	   preflighting to compute the required space).
 * NOTE:
 *	Malloc is used only if the size exceeds 2K characters (a very unlikely situation esp. with
 *	case conversion).
 *
 * Parameters:
 *	src  - string to convert (forced to string form here)
 *	kase - case code, validated with verify_case(); an unrecognized code raises ERR_BADCASECODE
 *	dst  - receives the converted string, which is laid out at stringpool.free
 *
 * Errors raised: ERR_BADCASECODE, ERR_ICUERROR, ERR_MAXSTRLEN (plus whatever utf8_len_strict()
 * reports when ICU flags an illegal/invalid character in the source).
 ***************************************************************************************************/
void	op_fnzconvert2(mval *src, mval *kase, mval *dst)
{
	int		index;
	int32_t		src_ustr_len, src_chlen, dst_chlen, ulen, dstlen = 0;
	UErrorCode	status;
	char		*dstbase;
	UChar		src_ustr[MAX_ZCONVBUFF], dst_ustr[MAX_ZCONVBUFF], *src_ustr_ptr, *dst_ustr_ptr;

	MV_FORCE_STR(kase);
	if (-1 == (index = verify_case(&kase->str)))
		rts_error(VARLSTCNT(4) ERR_BADCASECODE, 2, kase->str.len, kase->str.addr);
	MV_FORCE_STR(src);
	if (!gtm_utf8_mode)
	{	/* Byte-oriented conversion: the result has the same length as the source and is
		 * written straight into the stringpool by the casemap's "m" routine.
		 */
		dstlen = src->str.len;
		ENSURE_STP_FREE_SPACE(dstlen);
		dstbase = (char *)stringpool.free;
		assert(NULL != casemaps[index].m);
		(*casemaps[index].m)((unsigned char *)dstbase, (unsigned char *)src->str.addr, dstlen);
	} else if (0 != src->str.len)
	{
		MV_FORCE_LEN_STRICT(src);
		if (2 * src->str.char_len <= MAX_ZCONVBUFF)
		{	/* Check if the stack buffer is sufficient considering the worst case where all
			 * characters are surrogate pairs, each of which needs 2 UChars
			 */
			src_ustr_ptr = src_ustr;
			src_ustr_len = MAX_ZCONVBUFF;
		} else
		{	/* Fake the conversion from UTF-8 to UTF-16 to compute the required number of UChars */
			status = U_ZERO_ERROR;
			u_strFromUTF8(NULL, 0, &src_ustr_len, src->str.addr, src->str.len, &status);
			/* A buffer overflow status is the expected outcome of sizing with a NULL buffer;
			 * anything else is a genuine failure. Nothing has been allocated yet at this point.
			 */
			if (U_FAILURE(status) && (U_BUFFER_OVERFLOW_ERROR != status))
			{	/* Since there is no ustring.h API to return the actual illegal sequence,
				 * we need to search the UTF-8 source to identify the BADCHAR sequence
				 */
				if ((U_ILLEGAL_CHAR_FOUND == status) || (U_INVALID_CHAR_FOUND == status))
					utf8_len_strict((unsigned char *)src->str.addr, src->str.len);
				rts_error(VARLSTCNT(3) ERR_ICUERROR, 1, status);	/* ICU said bad, we say good */
			}
			src_ustr_ptr = (UChar *)malloc(src_ustr_len * SIZEOF(UChar));
		}
		/* Convert UTF-8 src to UTF-16 (UChar*) representation */
		status = U_ZERO_ERROR;
		u_strFromUTF8(src_ustr_ptr, src_ustr_len, &src_chlen, src->str.addr, src->str.len, &status);
		if (U_FAILURE(status))
		{
			RELEASE_IF_NOT_LOCAL(src_ustr_ptr, src_ustr);
			if ((U_ILLEGAL_CHAR_FOUND == status) || (U_INVALID_CHAR_FOUND == status))
				utf8_len_strict((unsigned char *)src->str.addr, src->str.len);	/* to report BADCHAR error */
			rts_error(VARLSTCNT(3) ERR_ICUERROR, 1, status); /* ICU said bad, we say good or don't recognize error */
		}
		/* Next, fake conversion to compute the required buffer size (aka preflighting in ICU) */
		status = U_ZERO_ERROR;
		dst_chlen = (*casemaps[index].u)(NULL, 0, src_ustr_ptr, src_chlen, NULL, &status);
		assert(U_BUFFER_OVERFLOW_ERROR == status);
		if (dst_chlen > MAX_ZCONVBUFF)	/* conversion increases the string length, allocate in heap instead */
			dst_ustr_ptr = (UChar *)malloc(dst_chlen * SIZEOF(UChar));
		else
			dst_ustr_ptr = dst_ustr;
		/* Now, perform the real conversion with sufficient buffers */
		status = U_ZERO_ERROR;
		dst_chlen = (*casemaps[index].u)(dst_ustr_ptr, dst_chlen, src_ustr_ptr, src_chlen, NULL, &status);
		if (U_FAILURE(status))
			GTMASSERT;
		RELEASE_IF_NOT_LOCAL(src_ustr_ptr, src_ustr);	/* UTF-16 source no longer needed */
		/* Fake the conversion from UTF-16 to UTF-8 to compute the required buffer size */
		status = U_ZERO_ERROR;
		dstlen = 0;
		u_strToUTF8(NULL, 0, &dstlen, dst_ustr_ptr, dst_chlen, &status);
		assert((U_BUFFER_OVERFLOW_ERROR == status) || U_SUCCESS(status));
		if (MAX_STRLEN < dstlen)
		{
			RELEASE_IF_NOT_LOCAL(dst_ustr_ptr, dst_ustr);
			rts_error(VARLSTCNT(1) ERR_MAXSTRLEN);
		}
		ENSURE_STP_FREE_SPACE(dstlen);
		dstbase = (char *)stringpool.free;
		status = U_ZERO_ERROR;
		u_strToUTF8(dstbase, dstlen, &ulen, dst_ustr_ptr, dst_chlen, &status);
		if (U_FAILURE(status))
		{	/* Release a heap-allocated UTF-16 result before raising the error, the same way the
			 * other error paths in this routine do, so the buffer is not left allocated.
			 */
			RELEASE_IF_NOT_LOCAL(dst_ustr_ptr, dst_ustr);
			rts_error(VARLSTCNT(3) ERR_ICUERROR, 1, status); /* ICU said bad, but same call above just returned OK */
		}
		if (ulen != dstlen)
			GTMASSERT;
		RELEASE_IF_NOT_LOCAL(dst_ustr_ptr, dst_ustr);
	} else
	{	/* Empty source in UTF-8 mode: there is nothing to convert and dstlen is still 0. Give the
		 * result a defined address (the same one the non-UTF-8 branch uses for an empty source)
		 * rather than handing an uninitialized pointer to MV_INIT_STRING below.
		 */
		dstbase = (char *)stringpool.free;
	}
	MV_INIT_STRING(dst, dstlen, dstbase);
	stringpool.free += dstlen;
}

/**************************************************************************************************
 * Three-argument $ZCONVERT(): convert "src" from the character set named by "ichset" to the one
 * named by "ochset".
 *
 * Parameters:
 *	src    - string to convert (forced to string form here)
 *	ichset - name of the source character set, resolved with get_chset_desc()
 *	ochset - name of the target character set, resolved with get_chset_desc()
 *	dst    - receives the result, described as "dstlen" bytes starting at stringpool.free
 *
 * Errors raised: ERR_INVFCN (with ERR_TEXT) when not in UTF-8 mode, ERR_BADCHSET when either
 * character set name is not recognized by get_chset_desc().
 ***************************************************************************************************/
void	op_fnzconvert3(mval *src, mval *ichset, mval *ochset, mval *dst)
{
	UConverter	*from, *to;
	int		dstlen;

	MV_FORCE_STR(src);
	if (!gtm_utf8_mode)
	{	/* Unicode not enabled, report error rather than silently ignoring the conversion */
		rts_error(VARLSTCNT(6) ERR_INVFCN, 0, ERR_TEXT, 2,
			LEN_AND_LIT("Three-argument form of $ZCONVERT() is not allowed in the current $ZCHSET"));
	}
	MV_FORCE_STR(ichset);
	MV_FORCE_STR(ochset);
	/* The only supported names are: "UTF-8", "UTF-16", "UTF-16LE" and "UTF-16BE" */
	if (NULL == (from = get_chset_desc(&ichset->str)))
		rts_error(VARLSTCNT(4) ERR_BADCHSET, 2, ichset->str.len, ichset->str.addr);
	if (NULL == (to = get_chset_desc(&ochset->str)))
		rts_error(VARLSTCNT(4) ERR_BADCHSET, 2, ochset->str.len, ochset->str.addr);
	/* No destination buffer or length is passed; the result is picked up at stringpool.free below.
	 * NOTE(review): presumably gtm_conv() reserves stringpool space and writes the converted text
	 * there when given a NULL buffer - confirm against gtm_conv().
	 */
	dstlen = gtm_conv(from, to, &src->str, NULL, NULL);
	assert(-1 != dstlen);
	MV_INIT_STRING(dst, dstlen, stringpool.free);
	stringpool.free += dst->str.len;
}