[bug-65581] support configurable temp file threshold

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1893421 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
PJ Fanning 2021-09-18 20:06:25 +00:00
parent 944ea414cd
commit 8d7af95fed
4 changed files with 119 additions and 11 deletions

View File

@ -17,37 +17,82 @@
package org.apache.poi.openxml4j.util;
import java.io.IOException;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.IOException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.TempFile;
/**
* So we can close the real zip entry and still
* effectively work with it.
* Holds the (decompressed!) data in memory, so
* Holds the (decompressed!) data in memory (or since POI 5.1.0, possibly in a temp file), so
* close this as soon as you can!
* @see ZipInputStreamZipEntrySource#setThresholdBytesForTempFiles(int)
*/
/* package */ class ZipArchiveFakeEntry extends ZipArchiveEntry {
private final byte[] data;
/* package */ class ZipArchiveFakeEntry extends ZipArchiveEntry implements Closeable {
private static Logger LOG = LogManager.getLogger(ZipArchiveFakeEntry.class);
private byte[] data;
private File tempFile;
ZipArchiveFakeEntry(ZipArchiveEntry entry, InputStream inp) throws IOException {
super(entry.getName());
final long entrySize = entry.getSize();
if (entrySize < -1 || entrySize>=Integer.MAX_VALUE) {
throw new IOException("ZIP entry size is too large or invalid");
}
final int threshold = ZipInputStreamZipEntrySource.getThresholdBytesForTempFiles();
if (threshold >= 0 && entrySize >= threshold) {
tempFile = TempFile.createTempFile("poi-zip-entry", ".tmp");
LOG.atInfo().log("created for temp file {} for zip entry {} of size {} bytes",
tempFile.getAbsolutePath(), entry.getName(), entrySize);
IOUtils.copy(inp, tempFile);
} else {
if (entrySize < -1 || entrySize >= Integer.MAX_VALUE) {
throw new IOException("ZIP entry size is too large or invalid");
}
// Grab the de-compressed contents for later
data = (entrySize == -1) ? IOUtils.toByteArray(inp) : IOUtils.toByteArray(inp, (int)entrySize);
// Grab the de-compressed contents for later
data = (entrySize == -1) ? IOUtils.toByteArray(inp) : IOUtils.toByteArray(inp, (int)entrySize);
}
}
/**
* Returns zip entry.
* @return input stream
* @throws RuntimeException since POI 5.1.0,
* a RuntimeException can occur if the optional temp file has been removed
* @see ZipInputStreamZipEntrySource#setThresholdBytesForTempFiles(int)
*/
public InputStream getInputStream() {
return new UnsynchronizedByteArrayInputStream(data);
if (tempFile != null) {
try {
return new FileInputStream(tempFile);
} catch (FileNotFoundException e) {
throw new RuntimeException("temp file " + tempFile.getAbsolutePath() + " is missing");
}
} else {
return new UnsynchronizedByteArrayInputStream(data);
}
}
/**
* Deletes any temp files and releases any byte arrays.
* @throws IOException
* @since POI 5.1.0
*/
@Override
public void close() throws IOException {
data = null;
if (tempFile != null) {
tempFile.delete();
}
}
}

View File

@ -34,15 +34,40 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
* done, to free up that memory!
*/
public class ZipInputStreamZipEntrySource implements ZipEntrySource {
private static int thresholdForTempFiles = -1;
private final Map<String, ZipArchiveFakeEntry> zipEntries = new HashMap<>();
private InputStream streamToClose;
/**
* Set the threshold at which a zip entry is regarded as too large for holding in memory
* and the data is put in a temp file instead.
* @param thresholdBytes number of bytes at which a zip entry is regarded as too large for holding in memory
* and the data is put in a temp file instead - defaults to -1 meaning temp files are not used
* and that zip entries with more than 2GB of data after decompressing will fail, 0 means all
* zip entries with a known size are stored in temp files (entries whose size is reported as -1
* are still held in memory). A threshold like 50000000 (approx 50MB) is recommended
* @since POI 5.1.0
*/
public static void setThresholdBytesForTempFiles(int thresholdBytes) {
thresholdForTempFiles = thresholdBytes;
}
/**
* Get the threshold at which a zip entry is regarded as too large for holding in memory
* and the data is put in a temp file instead (defaults to -1 meaning temp files are not used).
* @return threshold in bytes
* @since POI 5.1.0
*/
public static int getThresholdBytesForTempFiles() {
return thresholdForTempFiles;
}
/**
* Reads all the entries from the ZipInputStream
* into memory, and don't close (since POI 4.0.1) the source stream.
* We'll then eat lots of memory, but be able to
* work with the entries at-will.
* @see #setThresholdBytesForTempFiles
*/
public ZipInputStreamZipEntrySource(ZipArchiveThresholdInputStream inp) throws IOException {
for (;;) {
@ -69,6 +94,10 @@ public class ZipInputStreamZipEntrySource implements ZipEntrySource {
@Override
public void close() throws IOException {
for (ZipArchiveFakeEntry entry : zipEntries.values()) {
entry.close();
}
// Free the memory
zipEntries.clear();

View File

@ -53,6 +53,7 @@ import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.openxml4j.opc.internal.MemoryPackagePart;
import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
import org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource;
import org.apache.poi.ss.tests.usermodel.BaseTestXWorkbook;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;
@ -167,6 +168,26 @@ public final class TestXSSFWorkbook extends BaseTestXWorkbook {
}
}
@Test
void existingWithZipEntryTempFiles() throws Exception {
    // Remember the current setting so it can be put back once the test is done
    final int originalThreshold = ZipInputStreamZipEntrySource.getThresholdBytesForTempFiles();
    // Use a low threshold (100 bytes) for the duration of this test
    ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(100);
    try (
        XSSFWorkbook wb = openSampleWorkbook("Formatting.xlsx");
        OPCPackage opcPackage = OPCPackage.open(openSampleFileStream("Formatting.xlsx"))
    ) {
        assertNotNull(wb.getSharedStringSource());
        assertNotNull(wb.getStylesSource());

        // And check a few low level bits too
        final PackagePart workbookPart =
                opcPackage.getPart(PackagingURIHelper.createPartName("/xl/workbook.xml"));

        // Links to the three sheets, shared, styles and themes
        assertTrue(workbookPart.hasRelationships());
        assertEquals(6, workbookPart.getRelationships().size());
    } finally {
        // Always restore the previous threshold, the setting is static
        ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(originalThreshold);
    }
}
@Test
void getCellStyleAt() throws IOException{
try (XSSFWorkbook workbook = new XSSFWorkbook()) {

View File

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Log4j configuration with a single appender named "Console" that writes to SYSTEM_OUT
using the pattern below. The root logger is set to level "info" and sends its output
to that appender.
-->
<Configuration status="WARN">
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<Root level="info">
<AppenderRef ref="Console"/>
</Root>
</Loggers>
</Configuration>