NIFI-12238 Fix SplitText endline trimming with max fragment size (#7892)

This commit is contained in:
Gabor Gyimesi 2023-10-18 18:40:52 +02:00 committed by GitHub
parent 574c2b2168
commit 91e4b453b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 13 deletions

View File

@ -459,13 +459,6 @@ public class SplitText extends AbstractProcessor {
while ((offsetInfo = demarcator.nextOffsetInfo()) != null) { while ((offsetInfo = demarcator.nextOffsetInfo()) != null) {
lastCrlfLength = offsetInfo.getCrlfLength(); lastCrlfLength = offsetInfo.getCrlfLength();
if (offsetInfo.getLength() == offsetInfo.getCrlfLength()) {
trailingCrlfLength += offsetInfo.getCrlfLength();
trailingLineCount++;
} else if (offsetInfo.getLength() > offsetInfo.getCrlfLength()) {
trailingCrlfLength = 0; // non-empty line came in, thus resetting counter
}
if (length + offsetInfo.getLength() + startingLength > this.maxSplitSize) { if (length + offsetInfo.getLength() + startingLength > this.maxSplitSize) {
if (length == 0) { // single line per split if (length == 0) { // single line per split
length += offsetInfo.getLength(); length += offsetInfo.getLength();
@ -474,14 +467,21 @@ public class SplitText extends AbstractProcessor {
remaningOffsetInfo = offsetInfo; remaningOffsetInfo = offsetInfo;
} }
break; break;
} else { }
if (offsetInfo.getLength() == offsetInfo.getCrlfLength()) {
trailingCrlfLength += offsetInfo.getCrlfLength();
trailingLineCount++;
} else if (offsetInfo.getLength() > offsetInfo.getCrlfLength()) {
trailingCrlfLength = 0; // non-empty line came in, thus resetting counter
}
length += offsetInfo.getLength(); length += offsetInfo.getLength();
actualLineCount++; actualLineCount++;
if (splitMaxLineCount > 0 && actualLineCount >= splitMaxLineCount) { if (splitMaxLineCount > 0 && actualLineCount >= splitMaxLineCount) {
break; break;
} }
} }
}
if (actualLineCount > 0) { if (actualLineCount > 0) {
if (length - trailingCrlfLength >= lastCrlfLength) { if (length - trailingCrlfLength >= lastCrlfLength) {

View File

@ -890,4 +890,25 @@ public class TestSplitText {
splits.get(1).assertContentEquals("\n"); splits.get(1).assertContentEquals("\n");
} }
/**
 * Runs SplitText with a two-line header, a line split count of 0, a maximum
 * fragment size of "30 B" and trailing-newline removal enabled, against an
 * input whose body ends in a run of empty lines.
 *
 * Expects three splits, one original and no failures. The first two splits
 * carry the header plus body lines with no trailing newline; the third split
 * contains only the header lines.
 */
@Test
public void testMaxFragmentSizeWithTrimmedEndlines() {
    final String input = "header1\nheader2\nline1 longer than limit\nline2\nline3\n\n\n\n\n";
    // Expected content of each split, in the order the splits are emitted.
    final String[] expectedContents = {
        "header1\nheader2\nline1 longer than limit",
        "header1\nheader2\nline2\nline3",
        "header1\nheader2"
    };

    final TestRunner runner = TestRunners.newTestRunner(new SplitText());
    runner.setProperty(SplitText.HEADER_LINE_COUNT, "2");
    runner.setProperty(SplitText.LINE_SPLIT_COUNT, "0");
    runner.setProperty(SplitText.FRAGMENT_MAX_SIZE, "30 B");
    runner.setProperty(SplitText.REMOVE_TRAILING_NEWLINES, "true");

    runner.enqueue(input);
    runner.run();

    // Relationship counts are checked before content so a wrong split count fails first.
    runner.assertTransferCount(SplitText.REL_SPLITS, 3);
    runner.assertTransferCount(SplitText.REL_ORIGINAL, 1);
    runner.assertTransferCount(SplitText.REL_FAILURE, 0);

    final List<MockFlowFile> fragments = runner.getFlowFilesForRelationship(SplitText.REL_SPLITS);
    for (int index = 0; index < expectedContents.length; index++) {
        fragments.get(index).assertContentEquals(expectedContents[index]);
    }
}
} }