Add an optional "Filename Character Set" property to UnpackContent so that Zip entry
names written in a charset other than the platform default (for example Cp437 / IBM437
archives created on Windows) are decoded correctly. The configured charset is handed to
both Zip readers used by the processor: the commons-compress ZipArchiveInputStream
(no password configured) and the zip4j ZipInputStream (password configured).

diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
index e514b21466..1048db9509 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/pom.xml
@@ -697,6 +697,7 @@
                         <exclude>src/test/resources/TestUnpackContent/folder/cal.txt</exclude>
                         <exclude>src/test/resources/TestUnpackContent/folder/date.txt</exclude>
                         <exclude>src/test/resources/TestUnpackContent/invalid_data.zip</exclude>
+                        <exclude>src/test/resources/TestUnpackContent/windows-with-cp437.zip</exclude>
                         <exclude>src/test/resources/TestUpdateRecord/input/addresses.json</exclude>
                         <exclude>src/test/resources/TestUpdateRecord/input/embedded-string.json</exclude>
                         <exclude>src/test/resources/TestUpdateRecord/input/multi-arrays.json</exclude>
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/UnpackContent.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/UnpackContent.java
index 3b736aaa6c..34b8573389 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/UnpackContent.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/main/java/org/apache/nifi/processors/standard/UnpackContent.java
@@ -16,6 +16,7 @@
  */
 package org.apache.nifi.processors.standard;
 
+import java.nio.charset.Charset;
 import net.lingala.zip4j.io.inputstream.ZipInputStream;
 import net.lingala.zip4j.model.LocalFileHeader;
 import net.lingala.zip4j.model.enums.EncryptionMethod;
@@ -24,6 +25,7 @@ import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.io.Charsets;
 import org.apache.nifi.annotation.behavior.InputRequirement;
 import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
 import org.apache.nifi.annotation.behavior.ReadsAttribute;
@@ -34,6 +36,7 @@ import org.apache.nifi.annotation.behavior.WritesAttributes;
 import org.apache.nifi.annotation.documentation.CapabilityDescription;
 import org.apache.nifi.annotation.documentation.SeeAlso;
 import org.apache.nifi.annotation.documentation.Tags;
+import org.apache.nifi.annotation.documentation.UseCase;
 import org.apache.nifi.annotation.lifecycle.OnScheduled;
 import org.apache.nifi.annotation.lifecycle.OnStopped;
 import org.apache.nifi.components.PropertyDescriptor;
@@ -104,6 +107,13 @@ import java.util.regex.Pattern;
         @WritesAttribute(attribute = "file.permissions", description = "The read/write/execute permissions of the unpacked file (tar only)"),
         @WritesAttribute(attribute = "file.encryptionMethod", description = "The encryption method for entries in Zip archives")})
 @SeeAlso(MergeContent.class)
+@UseCase(
+    description = "Unpack Zip containing filenames with special characters, created on Windows with filename charset 'Cp437' or 'IBM437'.",
+    configuration = """
+        Set "Packaging Format" value to "zip" or "use mime.type attribute".
+        Set "Filename Character Set" value to "Cp437" or "IBM437".
+        """
+)
 public class UnpackContent extends AbstractProcessor {
     // attribute keys
     public static final String FRAGMENT_ID = FragmentAttributes.FRAGMENT_ID.key();
@@ -139,6 +149,22 @@ public class UnpackContent extends AbstractProcessor {
                     PackageFormat.FLOWFILE_STREAM_FORMAT_V2.toString(), PackageFormat.FLOWFILE_TAR_FORMAT.toString())
             .defaultValue(PackageFormat.AUTO_DETECT_FORMAT.toString())
             .build();
+
+    public static final PropertyDescriptor ZIP_FILENAME_CHARSET = new PropertyDescriptor.Builder()
+            .name("Filename Character Set")
+            .displayName("Filename Character Set")
+            .description(
+                "If supplied this character set will be supplied to the Zip utility to attempt to decode filenames using the specific character set. "
+                    + "If not specified the default platform character set will be used. This is useful if a Zip was created with a different character "
+                    + "set than the platform default and the zip uses non standard values to specify.")
+            .required(false)
+            .dependsOn(
+                PACKAGING_FORMAT,
+                PackageFormat.ZIP_FORMAT.toString(),
+                PackageFormat.AUTO_DETECT_FORMAT.toString())
+            .addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
+            .defaultValue(Charset.defaultCharset().name())
+            .build();
 
     public static final PropertyDescriptor FILE_FILTER = new PropertyDescriptor.Builder()
             .name("File Filter")
@@ -192,6 +218,7 @@ public class UnpackContent extends AbstractProcessor {
 
     private static final List<PropertyDescriptor> properties = List.of(
             PACKAGING_FORMAT,
+            ZIP_FILENAME_CHARSET,
             FILE_FILTER,
             PASSWORD,
             ALLOW_STORED_ENTRIES_WITH_DATA_DESCRIPTOR
@@ -231,7 +258,10 @@ public class UnpackContent extends AbstractProcessor {
             }
             final PropertyValue allowStoredEntriesWithDataDescriptorVal = context.getProperty(ALLOW_STORED_ENTRIES_WITH_DATA_DESCRIPTOR);
             final boolean allowStoredEntriesWithDataDescriptor = allowStoredEntriesWithDataDescriptorVal.isSet() ? allowStoredEntriesWithDataDescriptorVal.asBoolean() : false;
-            zipUnpacker = new ZipUnpacker(fileFilter, password, allowStoredEntriesWithDataDescriptor);
+
+            final String filenamesEncodingVal = context.getProperty(ZIP_FILENAME_CHARSET).getValue();
+            final Charset filenamesEncoding = Charsets.toCharset(filenamesEncodingVal);
+            zipUnpacker = new ZipUnpacker(fileFilter, password, allowStoredEntriesWithDataDescriptor, filenamesEncoding);
         }
     }
 
@@ -267,36 +297,33 @@ public class UnpackContent extends AbstractProcessor {
 
         // set the Unpacker to use for this FlowFile.  FlowFileUnpackager objects maintain state and are not reusable.
         final Unpacker unpacker;
-        final boolean addFragmentAttrs;
-        switch (packagingFormat) {
-            case TAR_FORMAT:
-            case X_TAR_FORMAT:
-                unpacker = tarUnpacker;
-                addFragmentAttrs = true;
-                break;
-            case ZIP_FORMAT:
-                unpacker = zipUnpacker;
-                addFragmentAttrs = true;
-                break;
-            case FLOWFILE_STREAM_FORMAT_V2:
-                unpacker = new FlowFileStreamUnpacker(new FlowFileUnpackagerV2());
-                addFragmentAttrs = false;
-                break;
-            case FLOWFILE_STREAM_FORMAT_V3:
-                unpacker = new FlowFileStreamUnpacker(new FlowFileUnpackagerV3());
-                addFragmentAttrs = false;
-                break;
-            case FLOWFILE_TAR_FORMAT:
-                unpacker = new FlowFileStreamUnpacker(new FlowFileUnpackagerV1());
-                addFragmentAttrs = false;
-                break;
-            case AUTO_DETECT_FORMAT:
-            default:
-                // The format of the unpacker should be known before initialization
-                throw new ProcessException(packagingFormat + " is not a valid packaging format");
-        }
+        final boolean addFragmentAttrs = switch (packagingFormat) {
+            case TAR_FORMAT, X_TAR_FORMAT -> {
+                unpacker = tarUnpacker;
+                yield true;
+            }
+            case ZIP_FORMAT -> {
+                unpacker = zipUnpacker;
+                yield true;
+            }
+            case FLOWFILE_STREAM_FORMAT_V2 -> {
+                unpacker = new FlowFileStreamUnpacker(new FlowFileUnpackagerV2());
+                yield false;
+            }
+            case FLOWFILE_STREAM_FORMAT_V3 -> {
+                unpacker = new FlowFileStreamUnpacker(new FlowFileUnpackagerV3());
+                yield false;
+            }
+            case FLOWFILE_TAR_FORMAT -> {
+                unpacker = new FlowFileStreamUnpacker(new FlowFileUnpackagerV1());
+                yield false;
+            }
+            default ->
+                // The format of the unpacker should be known before initialization
+                throw new ProcessException(packagingFormat + " is not a valid packaging format");
+        };
 
         final List<FlowFile> unpacked = new ArrayList<>();
         try {
             unpacker.unpack(session, flowFile, unpacked);
             if (unpacked.isEmpty()) {
@@ -309,7 +336,7 @@ public class UnpackContent extends AbstractProcessor {
                 finishFragmentAttributes(session, flowFile, unpacked);
             }
             session.transfer(unpacked, REL_SUCCESS);
-            final String fragmentId = unpacked.size() > 0 ? unpacked.get(0).getAttribute(FRAGMENT_ID) : null;
+            final String fragmentId = unpacked.isEmpty() ? null : unpacked.getFirst().getAttribute(FRAGMENT_ID);
             flowFile = FragmentAttributes.copyAttributesToOriginal(session, flowFile, fragmentId, unpacked.size());
             session.transfer(flowFile, REL_ORIGINAL);
             session.getProvenanceReporter().fork(flowFile, unpacked);
@@ -395,20 +422,22 @@ public class UnpackContent extends AbstractProcessor {
     private static class ZipUnpacker extends Unpacker {
         private final char[] password;
         private final boolean allowStoredEntriesWithDataDescriptor;
+        private final Charset filenameEncoding;
 
-        public ZipUnpacker(final Pattern fileFilter, final char[] password, final boolean allowStoredEntriesWithDataDescriptor) {
+        public ZipUnpacker(final Pattern fileFilter, final char[] password, final boolean allowStoredEntriesWithDataDescriptor, final Charset filenameEncoding) {
             super(fileFilter);
             this.password = password;
             this.allowStoredEntriesWithDataDescriptor = allowStoredEntriesWithDataDescriptor;
+            this.filenameEncoding = filenameEncoding;
         }
 
         @Override
         public void unpack(final ProcessSession session, final FlowFile source, final List<FlowFile> unpacked) {
             final String fragmentId = UUID.randomUUID().toString();
             if (password == null) {
-                session.read(source, new CompressedZipInputStreamCallback(fileFilter, session, source, unpacked, fragmentId, allowStoredEntriesWithDataDescriptor));
+                session.read(source, new CompressedZipInputStreamCallback(fileFilter, session, source, unpacked, fragmentId, allowStoredEntriesWithDataDescriptor, filenameEncoding));
             } else {
-                session.read(source, new EncryptedZipInputStreamCallback(fileFilter, session, source, unpacked, fragmentId, password));
+                session.read(source, new EncryptedZipInputStreamCallback(fileFilter, session, source, unpacked, fragmentId, password, filenameEncoding));
             }
         }
 
@@ -473,6 +502,7 @@ public class UnpackContent extends AbstractProcessor {
 
     private static class CompressedZipInputStreamCallback extends ZipInputStreamCallback {
         private final boolean allowStoredEntriesWithDataDescriptor;
+        private final Charset filenameEncoding;
 
         private CompressedZipInputStreamCallback(
                 final Pattern fileFilter,
@@ -480,15 +510,18 @@ public class UnpackContent extends AbstractProcessor {
                 final FlowFile sourceFlowFile,
                 final List<FlowFile> unpacked,
                 final String fragmentId,
-                final boolean allowStoredEntriesWithDataDescriptor
+                final boolean allowStoredEntriesWithDataDescriptor,
+                final Charset filenameEncoding
         ) {
             super(fileFilter, session, sourceFlowFile, unpacked, fragmentId);
             this.allowStoredEntriesWithDataDescriptor = allowStoredEntriesWithDataDescriptor;
+            this.filenameEncoding = filenameEncoding;
         }
 
         @Override
         public void process(final InputStream inputStream) throws IOException {
-            try (final ZipArchiveInputStream zipInputStream = new ZipArchiveInputStream(new BufferedInputStream(inputStream), null, true, allowStoredEntriesWithDataDescriptor)) {
+            try (final ZipArchiveInputStream zipInputStream = new ZipArchiveInputStream(new BufferedInputStream(inputStream),
+                    filenameEncoding.name(), true, allowStoredEntriesWithDataDescriptor)) {
                 ZipArchiveEntry zipEntry;
                 while ((zipEntry = zipInputStream.getNextZipEntry()) != null) {
                     processEntry(zipInputStream, zipEntry.isDirectory(), zipEntry.getName(), EncryptionMethod.NONE);
@@ -499,6 +532,7 @@ public class UnpackContent extends AbstractProcessor {
 
     private static class EncryptedZipInputStreamCallback extends ZipInputStreamCallback {
         private final char[] password;
+        private final Charset filenameEncoding;
 
         private EncryptedZipInputStreamCallback(
                 final Pattern fileFilter,
@@ -506,15 +540,17 @@ public class UnpackContent extends AbstractProcessor {
                 final FlowFile sourceFlowFile,
                 final List<FlowFile> unpacked,
                 final String fragmentId,
-                final char[] password
+                final char[] password,
+                final Charset filenameEncoding
         ) {
             super(fileFilter, session, sourceFlowFile, unpacked, fragmentId);
             this.password = password;
+            this.filenameEncoding = filenameEncoding;
         }
 
         @Override
         public void process(final InputStream inputStream) throws IOException {
-            try (final ZipInputStream zipInputStream = new ZipInputStream(new BufferedInputStream(inputStream), password)) {
+            try (final ZipInputStream zipInputStream = new ZipInputStream(new BufferedInputStream(inputStream), password, filenameEncoding)) {
                 LocalFileHeader zipEntry;
                 while ((zipEntry = zipInputStream.getNextEntry()) != null) {
                     processEntry(zipInputStream, zipEntry.isDirectory(), zipEntry.getFileName(), zipEntry.getEncryptionMethod());
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestUnpackContent.java b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestUnpackContent.java
index 392ebda777..3ffe92e568 100644
--- a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestUnpackContent.java
+++ b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/java/org/apache/nifi/processors/standard/TestUnpackContent.java
@@ -16,9 +16,12 @@
  */
 package org.apache.nifi.processors.standard;
 
+import java.nio.charset.StandardCharsets;
 import net.lingala.zip4j.io.outputstream.ZipOutputStream;
 import net.lingala.zip4j.model.ZipParameters;
 import net.lingala.zip4j.model.enums.EncryptionMethod;
+import org.apache.commons.io.Charsets;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.nifi.flowfile.attributes.CoreAttributes;
 import org.apache.nifi.util.MockFlowFile;
 import org.apache.nifi.util.TestRunner;
@@ -222,6 +225,93 @@ public class TestUnpackContent {
             flowFile.assertContentEquals(path.toFile());
         }
     }
+
+    @Test
+    public void testZipEncodingField() {
+        final TestRunner unpackRunner = TestRunners.newTestRunner(new UnpackContent());
+        unpackRunner.setProperty(UnpackContent.PACKAGING_FORMAT, UnpackContent.PackageFormat.ZIP_FORMAT.toString());
+        unpackRunner.setProperty(UnpackContent.ZIP_FILENAME_CHARSET, "invalid-encoding");
+        unpackRunner.assertNotValid();
+        unpackRunner.setProperty(UnpackContent.ZIP_FILENAME_CHARSET, "IBM437");
+        unpackRunner.assertValid();
+        unpackRunner.setProperty(UnpackContent.ZIP_FILENAME_CHARSET, "Cp437");
+        unpackRunner.assertValid();
+        unpackRunner.setProperty(UnpackContent.ZIP_FILENAME_CHARSET, StandardCharsets.ISO_8859_1.name());
+        unpackRunner.assertValid();
+        unpackRunner.setProperty(UnpackContent.ZIP_FILENAME_CHARSET, StandardCharsets.UTF_8.name());
+        unpackRunner.assertValid();
+    }
+
+    @Test
+    public void testZipWithCp437Encoding() throws IOException {
+        final String zipFilename = "windows-with-cp437.zip";
+        final TestRunner unpackRunner = TestRunners.newTestRunner(new UnpackContent());
+        final TestRunner autoUnpackRunner = TestRunners.newTestRunner(new UnpackContent());
+        unpackRunner.setProperty(UnpackContent.PACKAGING_FORMAT, UnpackContent.PackageFormat.ZIP_FORMAT.toString());
+        unpackRunner.setProperty(UnpackContent.ZIP_FILENAME_CHARSET, "Cp437");
+        unpackRunner.setProperty(UnpackContent.ALLOW_STORED_ENTRIES_WITH_DATA_DESCRIPTOR, "true"); // just forces this to be exercised
+
+        autoUnpackRunner.setProperty(UnpackContent.PACKAGING_FORMAT, UnpackContent.PackageFormat.AUTO_DETECT_FORMAT.toString());
+        autoUnpackRunner.setProperty(UnpackContent.ZIP_FILENAME_CHARSET, "Cp437");
+
+        unpackRunner.enqueue(dataPath.resolve(zipFilename));
+        unpackRunner.enqueue(dataPath.resolve(zipFilename));
+
+        final Map<String, String> attributes = new HashMap<>(1);
+        attributes.put("mime.type", "application/zip");
+        autoUnpackRunner.enqueue(dataPath.resolve(zipFilename), attributes);
+        autoUnpackRunner.enqueue(dataPath.resolve(zipFilename), attributes);
+        unpackRunner.run(2);
+        autoUnpackRunner.run(2);
+
+        unpackRunner.assertTransferCount(UnpackContent.REL_FAILURE, 0);
+        autoUnpackRunner.assertTransferCount(UnpackContent.REL_FAILURE, 0);
+
+        // In this test case only check for presence of `?` in filename and path for failure, since the zip was created on Windows,
+        // it will always output `?` if Cp437 encoding is not used during unpacking. The zip file also contains file and folder
+        // without special characters.
+        // As a result of these conditions, this test does not check for valid special character presence.
+        for (final TestRunner runner : List.of(unpackRunner, autoUnpackRunner)) {
+            for (final MockFlowFile flowFile : runner.getFlowFilesForRelationship(UnpackContent.REL_SUCCESS)) {
+                final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());
+                assertTrue(StringUtils.containsNone(filename, "?"), "filename contains '?': " + filename);
+                final String path = flowFile.getAttribute(CoreAttributes.PATH.key());
+                assertTrue(StringUtils.containsNone(path, "?"), "path contains '?': " + path);
+            }
+        }
+    }
+
+    @Test
+    public void testEncryptedZipWithCp437Encoding() throws IOException {
+        final TestRunner runner = TestRunners.newTestRunner(new UnpackContent());
+        runner.setProperty(UnpackContent.PACKAGING_FORMAT, UnpackContent.PackageFormat.ZIP_FORMAT.toString());
+        runner.setProperty(UnpackContent.ALLOW_STORED_ENTRIES_WITH_DATA_DESCRIPTOR, "false");
+        runner.setProperty(UnpackContent.ZIP_FILENAME_CHARSET, "Cp437");
+        final String password = String.class.getSimpleName();
+        runner.setProperty(UnpackContent.PASSWORD, password);
+
+        final char[] streamPassword = password.toCharArray();
+        final String contents = TestRunner.class.getCanonicalName();
+        final String specialChar = "\u00E4";
+        final String pathInZip = "path_with_special_%s_char/".formatted(specialChar);
+        final String filename = "filename_with_special_char%s.txt".formatted(specialChar);
+        final byte[] zipEncrypted = createZipEncryptedCp437(EncryptionMethod.AES, streamPassword, contents, pathInZip.concat(filename));
+        runner.enqueue(zipEncrypted);
+        runner.run();
+
+        runner.assertTransferCount(UnpackContent.REL_SUCCESS, 1);
+        runner.assertTransferCount(UnpackContent.REL_ORIGINAL, 1);
+
+        final List<MockFlowFile> unpacked = runner.getFlowFilesForRelationship(UnpackContent.REL_SUCCESS);
+        for (final MockFlowFile flowFile : unpacked) {
+            final String outputFilename = flowFile.getAttribute(CoreAttributes.FILENAME.key());
+            assertTrue(StringUtils.containsNone(outputFilename, "?"), "filename contains '?': " + outputFilename);
+            assertTrue(StringUtils.contains(outputFilename, specialChar), "filename missing '%s': %s".formatted(specialChar, outputFilename));
+            final String path = flowFile.getAttribute(CoreAttributes.PATH.key());
+            assertTrue(StringUtils.containsNone(path, "?"), "path contains '?': " + path);
+            assertTrue(StringUtils.contains(path, specialChar), "path missing '%s': %s".formatted(specialChar, path));
+        }
+    }
 
     @Test
     public void testZipEncryptionZipStandard() throws IOException {
@@ -526,4 +616,22 @@ public class TestUnpackContent {
 
         return outputStream.toByteArray();
     }
+
+    /** Creates an in-memory Zip archive holding one encrypted entry, with the Zip stream opened using the Cp437 charset. */
+    private byte[] createZipEncryptedCp437(final EncryptionMethod encryptionMethod, final char[] password, final String contents, final String filename) throws IOException {
+        final ZipParameters zipParameters = new ZipParameters();
+        zipParameters.setEncryptionMethod(encryptionMethod);
+        zipParameters.setEncryptFiles(true);
+        zipParameters.setFileNameInZip(filename);
+
+        final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+        // try-with-resources guarantees the Zip stream is closed before the bytes are read, even if writing the entry fails
+        try (ZipOutputStream zipOutputStream = new ZipOutputStream(outputStream, password, Charsets.toCharset("Cp437"))) {
+            zipOutputStream.putNextEntry(zipParameters);
+            zipOutputStream.write(contents.getBytes(StandardCharsets.UTF_8));
+            zipOutputStream.closeEntry();
+        }
+
+        return outputStream.toByteArray();
+    }
 }
diff --git a/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestUnpackContent/windows-with-cp437.zip b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestUnpackContent/windows-with-cp437.zip
new file mode 100644
index 0000000000..9255a56d6e
Binary files /dev/null and b/nifi-nar-bundles/nifi-standard-bundle/nifi-standard-processors/src/test/resources/TestUnpackContent/windows-with-cp437.zip differ