NIFI-3055 StandardRecordWriter Can Throw UTFDataFormatException (1.x)

* Remove function based on JDK source.
* Add new function to find bytes based on RFC3629.
* Add field name to log entry when field is truncated.

Signed-off-by: Mike Moser <mosermw@apache.org>
This closes #1475
This commit is contained in:
Joe Skora 2017-02-06 18:55:01 +00:00 committed by Mike Moser
parent 2bc7d52626
commit 41ad032151
3 changed files with 105 additions and 102 deletions

View File

@ -113,7 +113,7 @@ public class SchemaRecordWriter {
out.writeLong((Long) value);
break;
case STRING:
writeUTFLimited(out, (String) value);
writeUTFLimited(out, (String) value, field.getFieldName());
break;
case LONG_STRING:
final byte[] charArray = ((String) value).getBytes(StandardCharsets.UTF_8);
@ -134,7 +134,7 @@ public class SchemaRecordWriter {
break;
case UNION:
final NamedValue namedValue = (NamedValue) value;
writeUTFLimited(out, namedValue.getName());
writeUTFLimited(out, namedValue.getName(), field.getFieldName());
final Record childRecord = (Record) namedValue.getValue();
writeRecordFields(childRecord, out);
break;
@ -145,14 +145,14 @@ public class SchemaRecordWriter {
}
}
private void writeUTFLimited(final DataOutputStream out, final String utfString) throws IOException {
private void writeUTFLimited(final DataOutputStream out, final String utfString, final String fieldName) throws IOException {
try {
out.writeUTF(utfString);
} catch (UTFDataFormatException e) {
final String truncated = utfString.substring(0, getCharsInUTFLength(utfString, MAX_ALLOWED_UTF_LENGTH));
logger.warn("Truncating repository record value! Attempted to write {} chars that encode to a UTF byte length greater than "
final String truncated = utfString.substring(0, getCharsInUTF8Limit(utfString, MAX_ALLOWED_UTF_LENGTH));
logger.warn("Truncating repository record value for field '{}'! Attempted to write {} chars that encode to a UTF8 byte length greater than "
+ "supported maximum ({}), truncating to {} chars.",
utfString.length(), MAX_ALLOWED_UTF_LENGTH, truncated.length());
(fieldName == null) ? "" : fieldName, utfString.length(), MAX_ALLOWED_UTF_LENGTH, truncated.length());
if (logger.isDebugEnabled()) {
logger.warn("String value was:\n{}", truncated);
}
@ -160,28 +160,29 @@ public class SchemaRecordWriter {
}
}
static int getCharsInUTF8Limit(final String str, final int utf8Limit) {
// Calculate how much of String fits within UTF8 byte limit based on RFC3629.
//
// Java String values use char[] for storage, so character values >0xFFFF that
// map to 4 byte UTF8 representations are not considered.
static int getCharsInUTFLength(final String str, final int utfLimit) {
// see java.io.DataOutputStream.writeUTF()
int strlen = str.length();
int utflen = 0;
int c;
final int charsInOriginal = str.length();
int bytesInUTF8 = 0;
/* use charAt instead of copying String to Char array */
for (int i = 0; i < strlen; i++) {
c = str.charAt(i);
if ((c >= 0x0001) & (c <= 0x007F)) {
utflen++;
} else if (c > 0x07FF) {
utflen += 3;
for (int i = 0; i < charsInOriginal; i++) {
final int curr = str.charAt(i);
if (curr < 0x0080) {
bytesInUTF8++;
} else if (curr < 0x0800) {
bytesInUTF8 += 2;
} else {
utflen += 2;
bytesInUTF8 += 3;
}
if (utflen > utfLimit) {
if (bytesInUTF8 > utf8Limit) {
return i;
}
}
return strlen;
return charsInOriginal;
}
}

View File

@ -235,52 +235,52 @@ public class TestSchemaRecordReaderWriter {
}
@Test
public void testSingleCharUTFLengths() {
// verify handling of single characters mapping to 1, 2, and 3 utf byte strings
assertEquals("test 1 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 0));
assertEquals("test 2 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 0));
assertEquals("test 3 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 0));
assertEquals("test 1 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 1));
assertEquals("test 2 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 1));
assertEquals("test 3 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 1));
assertEquals("test 1 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 2));
assertEquals("test 2 char string truncated to 2 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 2));
assertEquals("test 3 char string truncated to 2 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 2));
assertEquals("test 1 char string truncated to 3 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 3));
assertEquals("test 2 char string truncated to 3 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 3));
assertEquals("test 3 char string truncated to 3 utf bytes should be 3", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 3));
public void testSingleCharUTF8Lengths() {
// verify handling of single characters mapping to utf8 byte strings
assertEquals("test 1 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 0));
assertEquals("test 2 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 0));
assertEquals("test 3 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 0));
assertEquals("test 1 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 1));
assertEquals("test 2 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 1));
assertEquals("test 3 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 1));
assertEquals("test 1 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 2));
assertEquals("test 2 char string truncated to 2 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 2));
assertEquals("test 3 char string truncated to 2 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 2));
assertEquals("test 1 char string truncated to 3 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 3));
assertEquals("test 2 char string truncated to 3 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 3));
assertEquals("test 3 char string truncated to 3 utf bytes should be 3", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 3));
}
@Test
public void testMultiCharUTFLengths() {
    // Test boundary conditions as 1, 2, and 3 UTF byte chars are included into utf limit positions used by strings.
    final String testString1 = utfStringOneByte + utfStringTwoByte + utfStringThreeByte; // char 'abc' utf 'abbccc'

    // expectedChars[limit] = number of chars that fit within 'limit' utf8 bytes:
    // utf '', 'a', 'a', 'abb', 'abb', 'abb', 'abbccc'
    final int[] expectedChars = {0, 1, 1, 2, 2, 2, 3};
    for (int limit = 0; limit < expectedChars.length; limit++) {
        assertEquals("3 char string (6 utf8 bytes) limited to " + limit + " utf8 bytes should keep " + expectedChars[limit] + " chars",
                expectedChars[limit], SchemaRecordWriter.getCharsInUTF8Limit(testString1, limit));
    }
}
@Test
public void testSmallCharUTFLengths() throws UnsupportedEncodingException {
    // Two repetitions of the 1, 2 and 3 byte fixture chars: 6 chars, 12 utf8 bytes in total.
    final String string12b = StringUtils.repeat(utfStringOneByte + utfStringTwoByte + utfStringThreeByte, 2);

    // expectedChars[limit] = number of chars that fit within 'limit' utf8 bytes
    final int[] expectedChars = {0, 1, 1, 2, 2, 2, 3, 4, 4, 5, 5, 5, 6};
    for (int limit = 0; limit < expectedChars.length; limit++) {
        assertEquals("test multi-char string truncated to " + limit + " utf bytes should be " + expectedChars[limit],
                expectedChars[limit], SchemaRecordWriter.getCharsInUTF8Limit(string12b, limit));
    }
}
@Test
@ -290,16 +290,16 @@ public class TestSchemaRecordReaderWriter {
assertEquals("test 64k char string should be 64k chars long", 65535, string64k.length());
// drop half the chars going to utf of 64k bytes -- (1+1+1) * 21845 = 65535 chars which converts to (1+2+3) * 21845 = 131070 utf bytes so 1/2 is truncated
assertEquals("test 64k char string truncated to 65,535 utf bytes should be 32768", 32768, SchemaRecordWriter.getCharsInUTFLength(string64k, 65535));
assertEquals("test 64k char string truncated to 65,535 utf bytes should be 32768", 32768, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65535));
// dropping bytes off the end of utf length
assertEquals("test 64k char string truncated to 65,534 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTFLength(string64k, 65534)); // lost 2 byte char
assertEquals("test 64k char string truncated to 65,533 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTFLength(string64k, 65533));
assertEquals("test 64k char string truncated to 65,532 utf bytes should be 32766", 32766, SchemaRecordWriter.getCharsInUTFLength(string64k, 65532)); // lost 1 byte char
assertEquals("test 64k char string truncated to 65,531 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 65531)); // lost 3 byte char
assertEquals("test 64k char string truncated to 65,530 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 65530));
assertEquals("test 64k char string truncated to 65,529 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 65529));
assertEquals("test 64k char string truncated to 65,528 utf bytes should be 32764", 32764, SchemaRecordWriter.getCharsInUTFLength(string64k, 65528)); // lost 2 byte char (again)
assertEquals("test 64k char string truncated to 65,534 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65534)); // lost 2 byte char
assertEquals("test 64k char string truncated to 65,533 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65533));
assertEquals("test 64k char string truncated to 65,532 utf bytes should be 32766", 32766, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65532)); // lost 1 byte char
assertEquals("test 64k char string truncated to 65,531 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65531)); // lost 3 byte char
assertEquals("test 64k char string truncated to 65,530 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65530));
assertEquals("test 64k char string truncated to 65,529 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65529));
assertEquals("test 64k char string truncated to 65,528 utf bytes should be 32764", 32764, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65528)); // lost 2 byte char (again)
}
private SimpleRecordField createField(final String fieldName, final FieldType type) {

View File

@ -76,16 +76,16 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re
final ProvenanceEventType recordType = record.getEventType();
out.writeLong(recordIdentifier);
writeUTFLimited(out, record.getEventType().name());
writeUTFLimited(out, record.getEventType().name(), "EventType");
out.writeLong(record.getEventTime());
out.writeLong(record.getFlowFileEntryDate());
out.writeLong(record.getEventDuration());
out.writeLong(record.getLineageStartDate());
writeNullableString(out, record.getComponentId());
writeNullableString(out, record.getComponentType());
writeNullableString(out, record.getComponentId(), "ComponentId");
writeNullableString(out, record.getComponentType(), "ComponentType");
writeUUID(out, record.getFlowFileUuid());
writeNullableString(out, record.getDetails());
writeNullableString(out, record.getDetails(), "Details");
// Write FlowFile attributes
final Map<String, String> attrs = record.getPreviousAttributes();
@ -105,9 +105,9 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re
// If Content Claim Info is present, write out a 'TRUE' followed by claim info. Else, write out 'false'.
if (record.getContentClaimSection() != null && record.getContentClaimContainer() != null && record.getContentClaimIdentifier() != null) {
out.writeBoolean(true);
writeUTFLimited(out, record.getContentClaimContainer());
writeUTFLimited(out, record.getContentClaimSection());
writeUTFLimited(out, record.getContentClaimIdentifier());
writeUTFLimited(out, record.getContentClaimContainer(), "ContentClaimContainer");
writeUTFLimited(out, record.getContentClaimSection(), "ContentClaimSection");
writeUTFLimited(out, record.getContentClaimIdentifier(), "ContentClaimIdentifier");
if (record.getContentClaimOffset() == null) {
out.writeLong(0L);
} else {
@ -121,9 +121,9 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re
// If Previous Content Claim Info is present, write out a 'TRUE' followed by claim info. Else, write out 'false'.
if (record.getPreviousContentClaimSection() != null && record.getPreviousContentClaimContainer() != null && record.getPreviousContentClaimIdentifier() != null) {
out.writeBoolean(true);
writeUTFLimited(out, record.getPreviousContentClaimContainer());
writeUTFLimited(out, record.getPreviousContentClaimSection());
writeUTFLimited(out, record.getPreviousContentClaimIdentifier());
writeUTFLimited(out, record.getPreviousContentClaimContainer(), "PreviousContentClaimContainer");
writeUTFLimited(out, record.getPreviousContentClaimSection(), "PreviousContentClaimSection");
writeUTFLimited(out, record.getPreviousContentClaimIdentifier(), "PreviousContentClaimIdentifier");
if (record.getPreviousContentClaimOffset() == null) {
out.writeLong(0L);
} else {
@ -140,28 +140,28 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re
}
// write out the identifier of the destination queue.
writeNullableString(out, record.getSourceQueueIdentifier());
writeNullableString(out, record.getSourceQueueIdentifier(), "SourceQueueIdentifier");
// Write type-specific info
if (recordType == ProvenanceEventType.FORK || recordType == ProvenanceEventType.JOIN || recordType == ProvenanceEventType.CLONE || recordType == ProvenanceEventType.REPLAY) {
writeUUIDs(out, record.getParentUuids());
writeUUIDs(out, record.getChildUuids());
} else if (recordType == ProvenanceEventType.RECEIVE) {
writeNullableString(out, record.getTransitUri());
writeNullableString(out, record.getSourceSystemFlowFileIdentifier());
writeNullableString(out, record.getTransitUri(), "TransitUri");
writeNullableString(out, record.getSourceSystemFlowFileIdentifier(), "SourceSystemFlowFileIdentifier");
} else if (recordType == ProvenanceEventType.FETCH) {
writeNullableString(out, record.getTransitUri());
writeNullableString(out, record.getTransitUri(), "TransitUri");
} else if (recordType == ProvenanceEventType.SEND) {
writeNullableString(out, record.getTransitUri());
writeNullableString(out, record.getTransitUri(), "TransitUri");
} else if (recordType == ProvenanceEventType.ADDINFO) {
writeNullableString(out, record.getAlternateIdentifierUri());
writeNullableString(out, record.getAlternateIdentifierUri(), "AlternateIdentifierUri");
} else if (recordType == ProvenanceEventType.ROUTE) {
writeNullableString(out, record.getRelationship());
writeNullableString(out, record.getRelationship(), "Relationship");
}
}
/**
 * Writes a UUID string through {@code writeUTFLimited}, using "UUID" as the
 * field name reported if the value has to be truncated.
 *
 * @param out the stream to write to
 * @param uuid the UUID string to write
 * @throws IOException if writing to the stream fails
 */
protected void writeUUID(final DataOutputStream out, final String uuid) throws IOException {
    final String fieldLabel = "UUID";
    writeUTFLimited(out, uuid, fieldLabel);
}
protected void writeUUIDs(final DataOutputStream out, final Collection<String> list) throws IOException {
@ -175,12 +175,12 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re
}
}
protected void writeNullableString(final DataOutputStream out, final String toWrite) throws IOException {
protected void writeNullableString(final DataOutputStream out, final String toWrite, String fieldName) throws IOException {
if (toWrite == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
writeUTFLimited(out, toWrite);
writeUTFLimited(out, toWrite, fieldName);
}
}
@ -199,14 +199,14 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re
out.write(bytes);
}
private void writeUTFLimited(final java.io.DataOutputStream out, final String utfString) throws IOException {
private void writeUTFLimited(final DataOutputStream out, final String utfString, final String fieldName) throws IOException {
try {
out.writeUTF(utfString);
} catch (UTFDataFormatException e) {
final String truncated = utfString.substring(0, getCharsInUTFLength(utfString, MAX_ALLOWED_UTF_LENGTH));
logger.warn("Truncating repository record value! Attempted to write {} chars that encode to a UTF byte length greater than "
final String truncated = utfString.substring(0, getCharsInUTF8Limit(utfString, MAX_ALLOWED_UTF_LENGTH));
logger.warn("Truncating repository record value for field '{}'! Attempted to write {} chars that encode to a UTF8 byte length greater than "
+ "supported maximum ({}), truncating to {} chars.",
utfString.length(), MAX_ALLOWED_UTF_LENGTH, truncated.length());
(fieldName == null) ? "" : fieldName, utfString.length(), MAX_ALLOWED_UTF_LENGTH, truncated.length());
if (logger.isDebugEnabled()) {
logger.warn("String value was:\n{}", truncated);
}
@ -214,27 +214,29 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re
}
}
static int getCharsInUTFLength(final String str, final int utfLimit) {
// see java.io.DataOutputStream.writeUTF()
int strlen = str.length();
int utflen = 0;
int c;
static int getCharsInUTF8Limit(final String str, final int utf8Limit) {
// Calculate how much of String fits within UTF8 byte limit based on RFC3629.
//
// Java String values use char[] for storage, so character values >0xFFFF that
// map to 4 byte UTF8 representations are not considered.
/* use charAt instead of copying String to Char array */
for (int i = 0; i < strlen; i++) {
c = str.charAt(i);
if ((c >= 0x0001) & (c <= 0x007F)) {
utflen++;
} else if (c > 0x07FF) {
utflen += 3;
final int charsInOriginal = str.length();
int bytesInUTF8 = 0;
for (int i = 0; i < charsInOriginal; i++) {
final int curr = str.charAt(i);
if (curr < 0x0080) {
bytesInUTF8++;
} else if (curr < 0x0800) {
bytesInUTF8 += 2;
} else {
utflen += 2;
bytesInUTF8 += 3;
}
if (utflen > utfLimit) {
if (bytesInUTF8 > utf8Limit) {
return i;
}
}
return strlen;
return charsInOriginal;
}