Adjust POIFileScanner for mass regression testing

Fix closing file-handles when scanning for files to test
Refactor to make it easier to test
Some more tests

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1875267 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dominik Stadler 2020-03-16 19:55:04 +00:00
parent 9de555bfd4
commit 260fec1a8b
1 changed files with 102 additions and 67 deletions

View File

@ -23,10 +23,15 @@ import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.stress.FileHandler;
import org.apache.poi.stress.XSSFFileHandler;
import org.apache.poi.util.SuppressForbidden;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.tools.ant.DirectoryScanner;
import org.junit.Ignore;
import org.junit.Test;
import java.io.File;
import java.io.FileInputStream;
@ -37,6 +42,8 @@ import java.util.Collection;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
/**
* Helper class to scan a folder for files and return a collection of
* found files together with the matching {@link FileHandler}.
@ -55,7 +62,6 @@ public class POIFileScanner {
* @throws IOException If determining the file-type fails
*/
public static Collection<Map.Entry<String, FileHandler>> scan(File rootDir) throws IOException {
DirectoryScanner scanner = new DirectoryScanner();
scanner.setBasedir(rootDir);
@ -65,10 +71,11 @@ public class POIFileScanner {
scanner.scan();
System.out.println("Handling " + scanner.getIncludedFiles().length + " files");
String[] includedFiles = scanner.getIncludedFiles();
System.out.println("Handling " + includedFiles.length + " files");
List<Map.Entry<String, FileHandler>> files = new ArrayList<>();
for(String file : scanner.getIncludedFiles()) {
for(String file : includedFiles) {
// breaks files with slash in their name on Linux:
// file = file.replace('\\', '/'); // ... failures/handlers lookup doesn't work on windows otherwise
@ -100,71 +107,99 @@ public class POIFileScanner {
protected static FileHandler getFileHandler(File rootDir, String file) throws IOException {
FileHandler fileHandler = TestAllFiles.HANDLERS.get(TestAllFiles.getExtension(file));
if(fileHandler == null) {
File testFile = new File(rootDir, file);
FileMagic magic = FileMagic.valueOf(testFile);
// if we have a file-type that we can read, but no extension, we try to determine the
// file type manually
switch(magic) {
case OLE2: {
try {
try (POIFSFileSystem fs = new POIFSFileSystem(testFile, true)) {
HSSFWorkbook.getWorkbookDirEntryName(fs.getRoot());
}
// we did not get an exception, so it seems this is a HSSFWorkbook
fileHandler = TestAllFiles.HANDLERS.get(".xls");
} catch (IOException | RuntimeException e) {
try {
try (FileInputStream istream = new FileInputStream(testFile)) {
try (HWPFDocument ignored = new HWPFDocument(istream)) {
// seems to be a valid document
fileHandler = TestAllFiles.HANDLERS.get(".doc");
}
}
} catch (IOException | RuntimeException e2) {
System.out.println("Could not open POIFSFileSystem for OLE2 file " + testFile + ": " + e + " and " + e2);
fileHandler = new TestAllFiles.NullFileHandler();
}
}
break;
}
case OOXML: {
try {
WorkbookFactory.create(testFile);
// seems to be a valid workbook
fileHandler = TestAllFiles.HANDLERS.get(".xlsx");
} catch (IOException | RuntimeException e) {
try {
try (FileInputStream is = new FileInputStream(testFile)) {
try (XWPFDocument ignored = new XWPFDocument(is)) {
// seems to be a valid document
fileHandler = TestAllFiles.HANDLERS.get(".docx");
}
}
} catch (IOException | RuntimeException e2) {
System.out.println("Could not open POIFSFileSystem for OOXML file " + testFile + ": " + e + " and " + e2);
fileHandler = new TestAllFiles.NullFileHandler();
}
}
break;
}
// do not warn about a few detected file types
case RTF:
case PDF:
case HTML:
fileHandler = new TestAllFiles.NullFileHandler();
break;
}
if(fileHandler == null) {
System.out.println("Did not get a handler for extension " + TestAllFiles.getExtension(file) +
" of file " + file + ": " + magic);
fileHandler = new TestAllFiles.NullFileHandler();
}
// we could not detect a type of file based on the extension, so we
// need to take a close look at the file
fileHandler = detectUnnamedFile(rootDir, file);
}
return fileHandler;
}
private static FileHandler detectUnnamedFile(File rootDir, String file) throws IOException {
File testFile = new File(rootDir, file);
// find out if it looks like OLE2 (HSSF, HSLF, HWPF, ...) or OOXML (XSSF, XSLF, XWPF, ...)
// and then determine the file type accordingly
FileMagic magic = FileMagic.valueOf(testFile);
switch (magic) {
case OLE2: {
try {
try (POIFSFileSystem fs = new POIFSFileSystem(testFile, true)) {
HSSFWorkbook.getWorkbookDirEntryName(fs.getRoot());
}
// we did not get an exception, so it seems this is a HSSFWorkbook
return TestAllFiles.HANDLERS.get(".xls");
} catch (IOException | RuntimeException e) {
try {
try (FileInputStream istream = new FileInputStream(testFile)) {
try (HWPFDocument ignored = new HWPFDocument(istream)) {
// seems to be a valid document
return TestAllFiles.HANDLERS.get(".doc");
}
}
} catch (IOException | RuntimeException e2) {
System.out.println("Could not open POIFSFileSystem for OLE2 file " + testFile + ": " + e + " and " + e2);
return new TestAllFiles.NullFileHandler();
}
}
}
case OOXML: {
try {
try (Workbook ignored = WorkbookFactory.create(testFile, null, true)) {
// seems to be a valid workbook
return TestAllFiles.HANDLERS.get(".xlsx");
}
} catch (IOException | RuntimeException e) {
try {
try (FileInputStream is = new FileInputStream(testFile)) {
try (XWPFDocument ignored = new XWPFDocument(is)) {
// seems to be a valid document
return TestAllFiles.HANDLERS.get(".docx");
}
}
} catch (IOException | RuntimeException e2) {
System.out.println("Could not open POIFSFileSystem for OOXML file " + testFile + ": " + e + " and " + e2);
return new TestAllFiles.NullFileHandler();
}
}
}
// do not warn about a few detected file types
case RTF:
case PDF:
case HTML:
return new TestAllFiles.NullFileHandler();
}
System.out.println("Did not get a handler for extension " + TestAllFiles.getExtension(file) +
" of file " + file + ": " + magic);
return new TestAllFiles.NullFileHandler();
}
@Ignore
@Test
@SuppressForbidden("Just an ignored test")
public void testInvalidFile() throws IOException, InterruptedException {
FileHandler fileHandler = POIFileScanner.getFileHandler(new File("/usbc/CommonCrawl"),
"www.bgs.ac.uk_downloads_directdownload.cfm_id=2362&noexcl=true&t=west_20sussex_20-_20building_20stone_20quarries");
assertEquals(XSSFFileHandler.class, fileHandler.getClass());
// to show the output from ZipFile() from commons-compress
// although I did not find out yet why the ZipFile is not closed here
System.gc();
Thread.sleep(1000);
System.gc();
Thread.sleep(1000);
}
@Test
public void testDetectUnnamedFile() throws IOException {
POIFileScanner.detectUnnamedFile(new File("test-data/spreadsheet"), "49156.xlsx");
}
@Test
public void test() throws IOException {
POIFileScanner.scan(new File("test-data"));
}
}