From 41ad032151ffaf0b56b18400be61e12e2742a58d Mon Sep 17 00:00:00 2001 From: Joe Skora Date: Mon, 6 Feb 2017 18:55:01 +0000 Subject: [PATCH] NIFI-3055 StandardRecordWriter Can Throw UTFDataFormatException (1.x) * Remove function based on JDK source. * Add new function to find bytes based on RFC3629. * Add field name to log entry when field is truncated. Signed-off-by: Mike Moser This closes #1475 --- .../repository/schema/SchemaRecordWriter.java | 43 +++++----- .../schema/TestSchemaRecordReaderWriter.java | 84 +++++++++---------- .../nifi/provenance/StandardRecordWriter.java | 80 +++++++++--------- 3 files changed, 105 insertions(+), 102 deletions(-) diff --git a/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java b/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java index 81043bc283..3e4a059322 100644 --- a/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java +++ b/nifi-commons/nifi-schema-utils/src/main/java/org/apache/nifi/repository/schema/SchemaRecordWriter.java @@ -113,7 +113,7 @@ public class SchemaRecordWriter { out.writeLong((Long) value); break; case STRING: - writeUTFLimited(out, (String) value); + writeUTFLimited(out, (String) value, field.getFieldName()); break; case LONG_STRING: final byte[] charArray = ((String) value).getBytes(StandardCharsets.UTF_8); @@ -134,7 +134,7 @@ public class SchemaRecordWriter { break; case UNION: final NamedValue namedValue = (NamedValue) value; - writeUTFLimited(out, namedValue.getName()); + writeUTFLimited(out, namedValue.getName(), field.getFieldName()); final Record childRecord = (Record) namedValue.getValue(); writeRecordFields(childRecord, out); break; @@ -145,14 +145,14 @@ public class SchemaRecordWriter { } } - private void writeUTFLimited(final DataOutputStream out, final String utfString) throws IOException { + private void writeUTFLimited(final DataOutputStream out, final String utfString, final String fieldName) throws IOException { try { out.writeUTF(utfString); } catch (UTFDataFormatException e) { - final String truncated = utfString.substring(0, getCharsInUTFLength(utfString, MAX_ALLOWED_UTF_LENGTH)); - logger.warn("Truncating repository record value! Attempted to write {} chars that encode to a UTF byte length greater than " + final String truncated = utfString.substring(0, getCharsInUTF8Limit(utfString, MAX_ALLOWED_UTF_LENGTH)); + logger.warn("Truncating repository record value for field '{}'! Attempted to write {} chars that encode to a UTF8 byte length greater than " + "supported maximum ({}), truncating to {} chars.", - utfString.length(), MAX_ALLOWED_UTF_LENGTH, truncated.length()); + (fieldName == null) ? "" : fieldName, utfString.length(), MAX_ALLOWED_UTF_LENGTH, truncated.length()); if (logger.isDebugEnabled()) { logger.warn("String value was:\n{}", truncated); } @@ -160,28 +160,29 @@ public class SchemaRecordWriter { } } + static int getCharsInUTF8Limit(final String str, final int utf8Limit) { + // Calculate how much of String fits within UTF8 byte limit based on RFC3629. + // + // Java String values use char[] for storage, so character values >0xFFFF that + // map to 4 byte UTF8 representations are not considered. - static int getCharsInUTFLength(final String str, final int utfLimit) { - // see java.io.DataOutputStream.writeUTF() - int strlen = str.length(); - int utflen = 0; - int c; + final int charsInOriginal = str.length(); + int bytesInUTF8 = 0; - /* use charAt instead of copying String to Char array */ - for (int i = 0; i < strlen; i++) { - c = str.charAt(i); - if ((c >= 0x0001) & (c <= 0x007F)) { - utflen++; - } else if (c > 0x07FF) { - utflen += 3; + for (int i = 0; i < charsInOriginal; i++) { + final int curr = str.charAt(i); + if (curr < 0x0080) { + bytesInUTF8++; + } else if (curr < 0x0800) { + bytesInUTF8 += 2; } else { - utflen += 2; + bytesInUTF8 += 3; } - if (utflen > utfLimit) { + if (bytesInUTF8 > utf8Limit) { return i; } } - return strlen; + return charsInOriginal; } } diff --git a/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java b/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java index 5eb815aa16..5dfd40ef4d 100644 --- a/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java +++ b/nifi-commons/nifi-schema-utils/src/test/java/org/apache/nifi/repository/schema/TestSchemaRecordReaderWriter.java @@ -235,52 +235,52 @@ public class TestSchemaRecordReaderWriter { } @Test - public void testSingleCharUTFLengths() { - // verify handling of single characters mapping to 1, 2, and 3 utf byte strings - assertEquals("test 1 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 0)); - assertEquals("test 2 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 0)); - assertEquals("test 3 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 0)); - assertEquals("test 1 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 1)); - assertEquals("test 2 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 1)); - assertEquals("test 3 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 1)); - assertEquals("test 1 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 2)); - assertEquals("test 2 char string truncated to 2 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 2)); - assertEquals("test 3 char string truncated to 2 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 2)); - assertEquals("test 1 char string truncated to 3 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringOneByte, 3)); - assertEquals("test 2 char string truncated to 3 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringTwoByte, 3)); - assertEquals("test 3 char string truncated to 3 utf bytes should be 3", 1, SchemaRecordWriter.getCharsInUTFLength(utfStringThreeByte, 3)); + public void testSingleCharUTF8Lengths() { + // verify handling of single characters mapping to utf8 byte strings + assertEquals("test 1 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 0)); + assertEquals("test 2 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 0)); + assertEquals("test 3 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 0)); + assertEquals("test 1 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 1)); + assertEquals("test 2 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 1)); + assertEquals("test 3 char string truncated to 1 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 1)); + assertEquals("test 1 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 2)); + assertEquals("test 2 char string truncated to 2 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 2)); + assertEquals("test 3 char string truncated to 2 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 2)); + assertEquals("test 1 char string truncated to 3 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringOneByte, 3)); + assertEquals("test 2 char string truncated to 3 utf bytes should be 2", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringTwoByte, 3)); + assertEquals("test 3 char string truncated to 3 utf bytes should be 3", 1, SchemaRecordWriter.getCharsInUTF8Limit(utfStringThreeByte, 3)); } @Test public void testMultiCharUTFLengths() { // test boundary conditions as 1, 2, and 3 UTF byte chars are included into utf limit positions used by strings final String testString1 = utfStringOneByte + utfStringTwoByte + utfStringThreeByte; // char 'abc' utf 'abbccc' - assertEquals("test 6 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(testString1, 0)); // utf '' - assertEquals("test 6 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTFLength(testString1, 1)); // utf 'a' - assertEquals("test 6 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTFLength(testString1, 2)); // utf 'a' - assertEquals("test 6 char string truncated to 3 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTFLength(testString1, 3)); // utf 'abb' - assertEquals("test 6 char string truncated to 4 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTFLength(testString1, 4)); // utf 'abb' - assertEquals("test 6 char string truncated to 5 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTFLength(testString1, 5)); // utf 'abb' - assertEquals("test 6 char string truncated to 6 utf bytes should be 3", 3, SchemaRecordWriter.getCharsInUTFLength(testString1, 6)); // utf 'abbccc' + assertEquals("test 6 char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 0)); // utf '' + assertEquals("test 6 char string truncated to 1 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 1)); // utf 'a' + assertEquals("test 6 char string truncated to 2 utf bytes should be 1", 1, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 2)); // utf 'a' + assertEquals("test 6 char string truncated to 3 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 3)); // utf 'abb' + assertEquals("test 6 char string truncated to 4 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 4)); // utf 'abb' + assertEquals("test 6 char string truncated to 5 utf bytes should be 2", 2, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 5)); // utf 'abb' + assertEquals("test 6 char string truncated to 6 utf bytes should be 3", 3, SchemaRecordWriter.getCharsInUTF8Limit(testString1, 6)); // utf 'abbccc' } @Test public void testSmallCharUTFLengths() throws UnsupportedEncodingException { final String string12b = StringUtils.repeat(utfStringOneByte + utfStringTwoByte + utfStringThreeByte, 2); - assertEquals("test multi-char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTFLength(string12b, 0)); - assertEquals("test multi-char string truncated to 1 utf bytes should be 0", 1, SchemaRecordWriter.getCharsInUTFLength(string12b, 1)); - assertEquals("test multi-char string truncated to 2 utf bytes should be 0", 1, SchemaRecordWriter.getCharsInUTFLength(string12b, 2)); - assertEquals("test multi-char string truncated to 3 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTFLength(string12b, 3)); - assertEquals("test multi-char string truncated to 4 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTFLength(string12b, 4)); - assertEquals("test multi-char string truncated to 5 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTFLength(string12b, 5)); - assertEquals("test multi-char string truncated to 6 utf bytes should be 0", 3, SchemaRecordWriter.getCharsInUTFLength(string12b, 6)); - assertEquals("test multi-char string truncated to 7 utf bytes should be 0", 4, SchemaRecordWriter.getCharsInUTFLength(string12b, 7)); - assertEquals("test multi-char string truncated to 8 utf bytes should be 0", 4, SchemaRecordWriter.getCharsInUTFLength(string12b, 8)); - assertEquals("test multi-char string truncated to 9 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTFLength(string12b, 9)); - assertEquals("test multi-char string truncated to 10 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTFLength(string12b, 10)); - assertEquals("test multi-char string truncated to 11 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTFLength(string12b, 11)); - assertEquals("test multi-char string truncated to 12 utf bytes should be 0", 6, SchemaRecordWriter.getCharsInUTFLength(string12b, 12)); + assertEquals("test multi-char string truncated to 0 utf bytes should be 0", 0, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 0)); + assertEquals("test multi-char string truncated to 1 utf bytes should be 0", 1, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 1)); + assertEquals("test multi-char string truncated to 2 utf bytes should be 0", 1, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 2)); + assertEquals("test multi-char string truncated to 3 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 3)); + assertEquals("test multi-char string truncated to 4 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 4)); + assertEquals("test multi-char string truncated to 5 utf bytes should be 0", 2, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 5)); + assertEquals("test multi-char string truncated to 6 utf bytes should be 0", 3, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 6)); + assertEquals("test multi-char string truncated to 7 utf bytes should be 0", 4, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 7)); + assertEquals("test multi-char string truncated to 8 utf bytes should be 0", 4, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 8)); + assertEquals("test multi-char string truncated to 9 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 9)); + assertEquals("test multi-char string truncated to 10 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 10)); + assertEquals("test multi-char string truncated to 11 utf bytes should be 0", 5, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 11)); + assertEquals("test multi-char string truncated to 12 utf bytes should be 0", 6, SchemaRecordWriter.getCharsInUTF8Limit(string12b, 12)); } @Test @@ -290,16 +290,16 @@ public class TestSchemaRecordReaderWriter { assertEquals("test 64k char string should be 64k chars long", 65535, string64k.length()); // drop half the chars going to utf of 64k bytes -- (1+1+1) * 21845 = 65535 chars which converts to (1+2+3) * 21845 = 131070 utf bytes so 1/2 is truncated - assertEquals("test 64k char string truncated to 65,535 utf bytes should be 32768", 32768, SchemaRecordWriter.getCharsInUTFLength(string64k, 65535)); + assertEquals("test 64k char string truncated to 65,535 utf bytes should be 32768", 32768, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65535)); // dropping bytes off the end of utf length - assertEquals("test 64k char string truncated to 65,534 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTFLength(string64k, 65534)); // lost 2 byte char - assertEquals("test 64k char string truncated to 65,533 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTFLength(string64k, 65533)); - assertEquals("test 64k char string truncated to 65,532 utf bytes should be 32766", 32766, SchemaRecordWriter.getCharsInUTFLength(string64k, 65532)); // lost 1 byte char - assertEquals("test 64k char string truncated to 65,531 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 65531)); // lost 3 byte char - assertEquals("test 64k char string truncated to 65,530 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 65530)); - assertEquals("test 64k char string truncated to 65,529 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTFLength(string64k, 65529)); - assertEquals("test 64k char string truncated to 65,528 utf bytes should be 32764", 32764, SchemaRecordWriter.getCharsInUTFLength(string64k, 65528)); // lost 2 byte char (again) + assertEquals("test 64k char string truncated to 65,534 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65534)); // lost 2 byte char + assertEquals("test 64k char string truncated to 65,533 utf bytes should be 32767", 32767, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65533)); + assertEquals("test 64k char string truncated to 65,532 utf bytes should be 32766", 32766, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65532)); // lost 1 byte char + assertEquals("test 64k char string truncated to 65,531 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65531)); // lost 3 byte char + assertEquals("test 64k char string truncated to 65,530 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65530)); + assertEquals("test 64k char string truncated to 65,529 utf bytes should be 32765", 32765, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65529)); + assertEquals("test 64k char string truncated to 65,528 utf bytes should be 32764", 32764, SchemaRecordWriter.getCharsInUTF8Limit(string64k, 65528)); // lost 2 byte char (again) } private SimpleRecordField createField(final String fieldName, final FieldType type) { diff --git a/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java b/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java index 076e507e52..46967672c9 100644 --- a/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java +++ b/nifi-nar-bundles/nifi-provenance-repository-bundle/nifi-persistent-provenance-repository/src/main/java/org/apache/nifi/provenance/StandardRecordWriter.java @@ -76,16 +76,16 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re final ProvenanceEventType recordType = record.getEventType(); out.writeLong(recordIdentifier); - writeUTFLimited(out, record.getEventType().name()); + writeUTFLimited(out, record.getEventType().name(), "EventType"); out.writeLong(record.getEventTime()); out.writeLong(record.getFlowFileEntryDate()); out.writeLong(record.getEventDuration()); out.writeLong(record.getLineageStartDate()); - writeNullableString(out, record.getComponentId()); - writeNullableString(out, record.getComponentType()); + writeNullableString(out, record.getComponentId(), "ComponentId"); + writeNullableString(out, record.getComponentType(), "ComponentType"); writeUUID(out, record.getFlowFileUuid()); - writeNullableString(out, record.getDetails()); + writeNullableString(out, record.getDetails(), "Details"); // Write FlowFile attributes final Map attrs = record.getPreviousAttributes(); @@ -105,9 +105,9 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re // If Content Claim Info is present, write out a 'TRUE' followed by claim info. Else, write out 'false'. if (record.getContentClaimSection() != null && record.getContentClaimContainer() != null && record.getContentClaimIdentifier() != null) { out.writeBoolean(true); - writeUTFLimited(out, record.getContentClaimContainer()); - writeUTFLimited(out, record.getContentClaimSection()); - writeUTFLimited(out, record.getContentClaimIdentifier()); + writeUTFLimited(out, record.getContentClaimContainer(), "ContentClaimContainer"); + writeUTFLimited(out, record.getContentClaimSection(), "ContentClaimSection"); + writeUTFLimited(out, record.getContentClaimIdentifier(), "ContentClaimIdentifier"); if (record.getContentClaimOffset() == null) { out.writeLong(0L); } else { @@ -121,9 +121,9 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re // If Previous Content Claim Info is present, write out a 'TRUE' followed by claim info. Else, write out 'false'. if (record.getPreviousContentClaimSection() != null && record.getPreviousContentClaimContainer() != null && record.getPreviousContentClaimIdentifier() != null) { out.writeBoolean(true); - writeUTFLimited(out, record.getPreviousContentClaimContainer()); - writeUTFLimited(out, record.getPreviousContentClaimSection()); - writeUTFLimited(out, record.getPreviousContentClaimIdentifier()); + writeUTFLimited(out, record.getPreviousContentClaimContainer(), "PreviousContentClaimContainer"); + writeUTFLimited(out, record.getPreviousContentClaimSection(), "PreviousContentClaimSection"); + writeUTFLimited(out, record.getPreviousContentClaimIdentifier(), "PreviousContentClaimIdentifier"); if (record.getPreviousContentClaimOffset() == null) { out.writeLong(0L); } else { @@ -140,28 +140,28 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re } // write out the identifier of the destination queue. - writeNullableString(out, record.getSourceQueueIdentifier()); + writeNullableString(out, record.getSourceQueueIdentifier(), "SourceQueueIdentifier"); // Write type-specific info if (recordType == ProvenanceEventType.FORK || recordType == ProvenanceEventType.JOIN || recordType == ProvenanceEventType.CLONE || recordType == ProvenanceEventType.REPLAY) { writeUUIDs(out, record.getParentUuids()); writeUUIDs(out, record.getChildUuids()); } else if (recordType == ProvenanceEventType.RECEIVE) { - writeNullableString(out, record.getTransitUri()); - writeNullableString(out, record.getSourceSystemFlowFileIdentifier()); + writeNullableString(out, record.getTransitUri(), "TransitUri"); + writeNullableString(out, record.getSourceSystemFlowFileIdentifier(), "SourceSystemFlowFileIdentifier"); } else if (recordType == ProvenanceEventType.FETCH) { - writeNullableString(out, record.getTransitUri()); + writeNullableString(out, record.getTransitUri(), "TransitUri"); } else if (recordType == ProvenanceEventType.SEND) { - writeNullableString(out, record.getTransitUri()); + writeNullableString(out, record.getTransitUri(), "TransitUri"); } else if (recordType == ProvenanceEventType.ADDINFO) { - writeNullableString(out, record.getAlternateIdentifierUri()); + writeNullableString(out, record.getAlternateIdentifierUri(), "AlternateIdentifierUri"); } else if (recordType == ProvenanceEventType.ROUTE) { - writeNullableString(out, record.getRelationship()); + writeNullableString(out, record.getRelationship(), "Relationship"); } } protected void writeUUID(final DataOutputStream out, final String uuid) throws IOException { - writeUTFLimited(out, uuid); + writeUTFLimited(out, uuid, "UUID"); } protected void writeUUIDs(final DataOutputStream out, final Collection list) throws IOException { @@ -175,12 +175,12 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re } } - protected void writeNullableString(final DataOutputStream out, final String toWrite) throws IOException { + protected void writeNullableString(final DataOutputStream out, final String toWrite, String fieldName) throws IOException { if (toWrite == null) { out.writeBoolean(false); } else { out.writeBoolean(true); - writeUTFLimited(out, toWrite); + writeUTFLimited(out, toWrite, fieldName); } } @@ -199,14 +199,14 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re out.write(bytes); } - private void writeUTFLimited(final java.io.DataOutputStream out, final String utfString) throws IOException { + private void writeUTFLimited(final DataOutputStream out, final String utfString, final String fieldName) throws IOException { try { out.writeUTF(utfString); } catch (UTFDataFormatException e) { - final String truncated = utfString.substring(0, getCharsInUTFLength(utfString, MAX_ALLOWED_UTF_LENGTH)); - logger.warn("Truncating repository record value! Attempted to write {} chars that encode to a UTF byte length greater than " + final String truncated = utfString.substring(0, getCharsInUTF8Limit(utfString, MAX_ALLOWED_UTF_LENGTH)); + logger.warn("Truncating repository record value for field '{}'! Attempted to write {} chars that encode to a UTF8 byte length greater than " + "supported maximum ({}), truncating to {} chars.", - utfString.length(), MAX_ALLOWED_UTF_LENGTH, truncated.length()); + (fieldName == null) ? "" : fieldName, utfString.length(), MAX_ALLOWED_UTF_LENGTH, truncated.length()); if (logger.isDebugEnabled()) { logger.warn("String value was:\n{}", truncated); } @@ -214,27 +214,29 @@ public class StandardRecordWriter extends CompressableRecordWriter implements Re } } - static int getCharsInUTFLength(final String str, final int utfLimit) { - // see java.io.DataOutputStream.writeUTF() - int strlen = str.length(); - int utflen = 0; - int c; + static int getCharsInUTF8Limit(final String str, final int utf8Limit) { + // Calculate how much of String fits within UTF8 byte limit based on RFC3629. + // + // Java String values use char[] for storage, so character values >0xFFFF that + // map to 4 byte UTF8 representations are not considered. - /* use charAt instead of copying String to Char array */ - for (int i = 0; i < strlen; i++) { - c = str.charAt(i); - if ((c >= 0x0001) & (c <= 0x007F)) { - utflen++; - } else if (c > 0x07FF) { - utflen += 3; + final int charsInOriginal = str.length(); + int bytesInUTF8 = 0; + + for (int i = 0; i < charsInOriginal; i++) { + final int curr = str.charAt(i); + if (curr < 0x0080) { + bytesInUTF8++; + } else if (curr < 0x0800) { + bytesInUTF8 += 2; } else { - utflen += 2; + bytesInUTF8 += 3; } - if (utflen > utfLimit) { + if (bytesInUTF8 > utf8Limit) { return i; } } - return strlen; + return charsInOriginal; }