169 lines
6.8 KiB
C
169 lines
6.8 KiB
C
/****************************************************************
|
|
* *
|
|
* Copyright 2006, 2011 Fidelity Information Services, Inc *
|
|
* *
|
|
* This source code contains the intellectual property *
|
|
* of its copyright holder(s), and is made available *
|
|
* under a license. If you do not know the terms of *
|
|
* the license, please stop and do not read further. *
|
|
* *
|
|
****************************************************************/
|
|
|
|
#include "mdef.h"
|
|
#include "gtm_string.h"
|
|
#include "op.h"
|
|
#include "stringpool.h"
|
|
#include "gtm_icu_api.h"
|
|
#include "gtm_conv.h"
|
|
#include "gtm_utf8.h"
|
|
|
|
GBLREF boolean_t gtm_utf8_mode;
|
|
GBLREF spdesc stringpool;
|
|
GBLREF casemap_t casemaps[];
|
|
|
|
error_def(ERR_BADCASECODE);
|
|
error_def(ERR_BADCHSET);
|
|
error_def(ERR_ICUERROR);
|
|
error_def(ERR_MAXSTRLEN);
|
|
error_def(ERR_INVFCN);
|
|
error_def(ERR_TEXT);
|
|
|
|
#define RELEASE_IF_NOT_LOCAL(ptr, local) ((ptr) != (local)) ? (free(ptr), (ptr = NULL)) : ptr;
|
|
|
|
/**************************************************************************************************
|
|
* Routine to perform string-level case conversion to "upper", "lower" and "title" case.
|
|
* Since ICU only supports API using UTF-16 representation, case conversion of UTF-8 strings involves
|
|
* encoding conversion as described below:
|
|
* 1. First, the UTF-8 source string is converted to UTF-16 representation (u_strFromUTF8())
|
|
* which is stored in a local buffer of size MAX_ZCONVBUFF. If this space is not sufficient,
|
|
* we try to allocate it in heap.
|
|
* 2. Since case conversion may expand the string, we compute the desired space required by
|
|
* preflighting the ICU case conversion API and then allocate the space before performing
|
|
* the real conversion.
|
|
* 3. Translating the converted UTF-16 string back to UTF-8 is done in stringpool (with similar
|
|
* preflighting to compute the required space.
|
|
* NOTE:
|
|
* Malloc is used only if the size exceeds 2K characters (a very unlikely situation esp. with
|
|
* case conversion).
|
|
*
|
|
***************************************************************************************************/
|
|
void op_fnzconvert2(mval *src, mval *kase, mval *dst)
|
|
{
|
|
int index;
|
|
int32_t src_ustr_len, src_chlen, dst_chlen, ulen, dstlen = 0;
|
|
UErrorCode status;
|
|
char *dstbase;
|
|
UChar src_ustr[MAX_ZCONVBUFF], dst_ustr[MAX_ZCONVBUFF], *src_ustr_ptr, *dst_ustr_ptr;
|
|
|
|
MV_FORCE_STR(kase);
|
|
if (-1 == (index = verify_case(&kase->str)))
|
|
rts_error(VARLSTCNT(4) ERR_BADCASECODE, 2, kase->str.len, kase->str.addr);
|
|
|
|
MV_FORCE_STR(src);
|
|
/* allocate stringpool */
|
|
if (!gtm_utf8_mode)
|
|
{
|
|
dstlen = src->str.len;
|
|
ENSURE_STP_FREE_SPACE(dstlen);
|
|
dstbase = (char *)stringpool.free;
|
|
assert(NULL != casemaps[index].m);
|
|
(*casemaps[index].m)((unsigned char *)dstbase, (unsigned char *)src->str.addr, dstlen);
|
|
} else if (0 != src->str.len)
|
|
{
|
|
MV_FORCE_LEN_STRICT(src);
|
|
if (2 * src->str.char_len <= MAX_ZCONVBUFF)
|
|
{ /* Check if the stack buffer is sufficient considering the worst case where all
|
|
characters are surrogate pairs, each of which needs 2 UChars */
|
|
src_ustr_ptr = src_ustr;
|
|
src_ustr_len = MAX_ZCONVBUFF;
|
|
} else
|
|
{ /* Fake the conversion from UTF-8 to UTF-16 to compute the required number of UChars */
|
|
status = U_ZERO_ERROR;
|
|
u_strFromUTF8(NULL, 0, &src_ustr_len, src->str.addr, src->str.len, &status);
|
|
if (U_FAILURE(status))
|
|
{ /* Since there is no ustring.h API to return the actual illegal sequence,
|
|
* we need to search the UTF-8 source to identify the BADCHAR sequence
|
|
* might get buffer overflow on the way to ensuring enough space
|
|
*/
|
|
if (U_FAILURE(status) && (U_BUFFER_OVERFLOW_ERROR != status))
|
|
{
|
|
if (U_ILLEGAL_CHAR_FOUND == status || U_INVALID_CHAR_FOUND == status)
|
|
utf8_len_strict((unsigned char *)src->str.addr, src->str.len);
|
|
rts_error(VARLSTCNT(3) ERR_ICUERROR, 1, status); /* ICU said bad, we say good */
|
|
}
|
|
}
|
|
src_ustr_ptr = (UChar*)malloc(src_ustr_len * SIZEOF(UChar));
|
|
}
|
|
/* Convert UTF-8 src to UTF-16 (UChar*) representation */
|
|
status = U_ZERO_ERROR;
|
|
u_strFromUTF8(src_ustr_ptr, src_ustr_len, &src_chlen, src->str.addr, src->str.len, &status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
RELEASE_IF_NOT_LOCAL(src_ustr_ptr, src_ustr);
|
|
if (U_ILLEGAL_CHAR_FOUND == status || U_INVALID_CHAR_FOUND == status)
|
|
utf8_len_strict((unsigned char *)src->str.addr, src->str.len); /* to report BADCHAR error */
|
|
rts_error(VARLSTCNT(3) ERR_ICUERROR, 1, status); /* ICU said bad, we say good or don't recognize error*/
|
|
}
|
|
/* Next, fake conversion to compute the required buffer size (aka preflighting in ICU) */
|
|
status = U_ZERO_ERROR;
|
|
dst_chlen = (*casemaps[index].u)(NULL, 0, src_ustr_ptr, src_chlen, NULL, &status);
|
|
assert(U_BUFFER_OVERFLOW_ERROR == status);
|
|
if (dst_chlen > MAX_ZCONVBUFF) /* conversion increases the string length, allocate in heap instead */
|
|
dst_ustr_ptr = (UChar*)malloc(dst_chlen * SIZEOF(UChar));
|
|
else
|
|
dst_ustr_ptr = dst_ustr;
|
|
/* Now, perform the real conversion with sufficient buffers */
|
|
status = U_ZERO_ERROR;
|
|
dst_chlen = (*casemaps[index].u)(dst_ustr_ptr, dst_chlen, src_ustr_ptr, src_chlen, NULL, &status);
|
|
if (U_FAILURE(status))
|
|
GTMASSERT;
|
|
RELEASE_IF_NOT_LOCAL(src_ustr_ptr, src_ustr);
|
|
/* Fake the conversion from UTF-16 to UTF-8 to compute the required buffer size */
|
|
status = U_ZERO_ERROR;
|
|
dstlen = 0;
|
|
u_strToUTF8(NULL, 0, &dstlen, dst_ustr_ptr, dst_chlen, &status);
|
|
assert(U_BUFFER_OVERFLOW_ERROR == status || U_SUCCESS(status));
|
|
if (MAX_STRLEN < dstlen)
|
|
{
|
|
RELEASE_IF_NOT_LOCAL(dst_ustr_ptr, dst_ustr);
|
|
rts_error(VARLSTCNT(1) ERR_MAXSTRLEN);
|
|
}
|
|
ENSURE_STP_FREE_SPACE(dstlen);
|
|
dstbase = (char *)stringpool.free;
|
|
status = U_ZERO_ERROR;
|
|
u_strToUTF8(dstbase, dstlen, &ulen, dst_ustr_ptr, dst_chlen, &status);
|
|
if (U_FAILURE(status))
|
|
rts_error(VARLSTCNT(3) ERR_ICUERROR, 1, status); /* ICU said bad, but same call above just returned OK */
|
|
if (ulen != dstlen)
|
|
GTMASSERT;
|
|
RELEASE_IF_NOT_LOCAL(dst_ustr_ptr, dst_ustr);
|
|
}
|
|
MV_INIT_STRING(dst, dstlen, dstbase);
|
|
stringpool.free += dstlen;
|
|
}
|
|
|
|
void op_fnzconvert3(mval *src, mval* ichset, mval* ochset, mval* dst)
|
|
{
|
|
UConverter *from, *to;
|
|
int dstlen;
|
|
|
|
MV_FORCE_STR(src);
|
|
if (!gtm_utf8_mode)
|
|
{ /* Unicode not enabled, report error rather than silently ignoring the conversion */
|
|
rts_error(VARLSTCNT(6) ERR_INVFCN, 0, ERR_TEXT, 2,
|
|
LEN_AND_LIT("Three-argument form of $ZCONVERT() is not allowed in the current $ZCHSET"));
|
|
}
|
|
MV_FORCE_STR(ichset);
|
|
MV_FORCE_STR(ochset);
|
|
/* The only supported names are: "UTF-8", "UTF-16", "UTF-16LE" and "UTF-16BE */
|
|
if (NULL == (from = get_chset_desc(&ichset->str)))
|
|
rts_error(VARLSTCNT(4) ERR_BADCHSET, 2, ichset->str.len, ichset->str.addr);
|
|
if (NULL == (to = get_chset_desc(&ochset->str)))
|
|
rts_error(VARLSTCNT(4) ERR_BADCHSET, 2, ochset->str.len, ochset->str.addr);
|
|
|
|
dstlen = gtm_conv(from, to, &src->str, NULL, NULL);
|
|
assert(-1 != dstlen);
|
|
MV_INIT_STRING(dst, dstlen, stringpool.free);
|
|
stringpool.free += dst->str.len;
|
|
}
|