From 76307fe94bb6a3555967372ff7879a6adcb0f0f7 Mon Sep 17 00:00:00 2001
From: Dominik Stadler
Date: Fri, 27 Feb 2015 09:59:14 +0000
Subject: [PATCH] * Add text-extraction verification to integration-tests via a
 new abstract base FileHandler * Fix NullPointerException found in some
 documents when running against the test-data * Add support for extracting
 text from Dir-Entries WORKBOOK and BOOK to support some old/strangely
 formatted XLS files.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1662652 13f79535-47bb-0310-9956-ffa450edef68
---
Review notes (everything between "---" and the first "diff --git" is ignored
when the patch is applied and is not part of the commit message):

 * The line structure of this patch was restored from a copy in which all line
   breaks had been collapsed. Indentation of the context lines is reconstructed,
   so apply with "git apply --ignore-whitespace" (or "patch -l").
 * AbstractFileHandler: EXPECTED_EXTRACTOR_FAILURES is typed as Set<String>,
   it is now queried with a "directory/file-name" key instead of a java.io.File
   (which could never be contained in a set of strings), and the null-check of
   the extractor was moved in front of the try-block. Javadoc was added, the
   new file therefore has 75 instead of 55 lines and the diffstat was adjusted.

 .../org/apache/poi/TestAllFiles.java          | 36 +++++----
 .../poi/stress/AbstractFileHandler.java       | 75 +++++++++++++++++++
 .../org/apache/poi/stress/FileHandler.java    |  7 ++
 .../apache/poi/stress/HMEFFileHandler.java    |  2 +-
 .../apache/poi/stress/HPSFFileHandler.java    |  2 +-
 .../apache/poi/stress/HSSFFileHandler.java    |  7 ++
 .../apache/poi/stress/POIFSFileHandler.java   |  2 +-
 .../apache/poi/stress/SpreadsheetHandler.java |  2 +-
 .../apache/poi/stress/XSLFFileHandler.java    |  2 +-
 .../apache/poi/stress/XWPFFileHandler.java    |  2 +-
 .../poi/extractor/ExtractorFactory.java       |  4 +-
 .../xssf/extractor/XSSFExcelExtractor.java    |  8 +-
 12 files changed, 127 insertions(+), 22 deletions(-)
 create mode 100644 src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java

diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java
index 4608303537..d0439b40fd 100644
--- a/src/integrationtest/org/apache/poi/TestAllFiles.java
+++ b/src/integrationtest/org/apache/poi/TestAllFiles.java
@@ -253,20 +253,26 @@ public class TestAllFiles {
     @Test
     public void testAllFiles() throws Exception {
         assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler);
-        InputStream stream = new BufferedInputStream(new FileInputStream(new File(ROOT_DIR, file)),100);
+        File inputFile = new File(ROOT_DIR, file);
+
         try {
-            handler.handleFile(stream);
-
-            assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
-                EXPECTED_FAILURES.contains(file));
-        } catch (Exception e) {
-            // check if we expect failure for this file
-            if(!EXPECTED_FAILURES.contains(file)) {
-                throw new Exception("While handling " + file, e);
-            }
-        } finally {
-            stream.close();
-        }
+            InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100);
+            try {
+                handler.handleFile(stream);
+
+                assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
+                    EXPECTED_FAILURES.contains(file));
+            } finally {
+                stream.close();
+            }
+
+            handler.handleExtracting(inputFile);
+        } catch (Exception e) {
+            // check if we expect failure for this file
+            if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
+                throw new Exception("While handling " + file, e);
+            }
+        }
     }
 
     private static String getExtension(String file) {
@@ -282,5 +288,9 @@ public class TestAllFiles {
         @Override
         public void handleFile(InputStream stream) throws Exception {
         }
+
+        @Override
+        public void handleExtracting(File file) throws Exception {
+        }
     }
 }
diff --git a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
new file mode 100644
index 0000000000..85ebb1b451
--- /dev/null
+++ b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java
@@ -0,0 +1,75 @@
+package org.apache.poi.stress;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.File;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+
+/**
+ * Base class for file handlers which adds a check that text and metadata-text
+ * can be extracted from a file via {@link ExtractorFactory}.
+ */
+public abstract class AbstractFileHandler implements FileHandler {
+    /** Files, named "directory/file-name", for which text-extraction is expected to fail. */
+    public static final Set<String> EXPECTED_EXTRACTOR_FAILURES = new HashSet<String>();
+    static {
+        // password protected files
+        EXPECTED_EXTRACTOR_FAILURES.add("document/bug53475-password-is-pass.docx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/extenxls_pwd123.xlsx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protect.xlsx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_agile.docx");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_sha512.xlsx");
+
+        // unsupported file-types, no supported OLE2 parts
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/quick-winmail.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/winmail-sample1.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-simple.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-with-attachments.dat");
+        EXPECTED_EXTRACTOR_FAILURES.add("hpsf/Test0313rur.adm");
+        EXPECTED_EXTRACTOR_FAILURES.add("hsmf/attachment_msg_pdf.msg");
+        EXPECTED_EXTRACTOR_FAILURES.add("poifs/Notes.ole2");
+        EXPECTED_EXTRACTOR_FAILURES.add("slideshow/testPPT.thmx");
+    }
+
+    /**
+     * Creates a text-extractor for the given file and verifies that both the
+     * text and the metadata-text can be extracted.
+     *
+     * @param file the file to extract text from
+     * @throws Exception if the extractor cannot be created or if extraction fails,
+     *         only an IllegalArgumentException during extraction is tolerated for
+     *         files listed in {@link #EXPECTED_EXTRACTOR_FAILURES}
+     */
+    public void handleExtracting(File file) throws Exception {
+        // EXPECTED_EXTRACTOR_FAILURES contains "directory/file-name" strings, so
+        // looking up the File itself could never match; build the matching key
+        File parent = file.getParentFile();
+        String key = (parent == null ? "" : parent.getName() + "/") + file.getName();
+
+        POITextExtractor extractor = ExtractorFactory.createExtractor(file);
+        // check this outside of the try-block, extractor.close() in finally would
+        // otherwise hide a failed assertion behind a NullPointerException
+        assertNotNull(extractor);
+        try {
+            assertNotNull(extractor.getText());
+
+            // also try metadata
+            POITextExtractor metadataExtractor = extractor.getMetadataTextExtractor();
+            assertNotNull(metadataExtractor.getText());
+
+            assertFalse("Expected Extraction to fail for file " + file + " and handler " + this + ", but did not fail!",
+                    EXPECTED_EXTRACTOR_FAILURES.contains(key));
+        } catch (IllegalArgumentException e) {
+            if(!EXPECTED_EXTRACTOR_FAILURES.contains(key)) {
+                throw new Exception("While handling " + file, e);
+            }
+        } finally {
+            extractor.close();
+        }
+    }
+}
diff --git a/src/integrationtest/org/apache/poi/stress/FileHandler.java b/src/integrationtest/org/apache/poi/stress/FileHandler.java
index e6f3385f02..ce2991b0bc 100644
--- a/src/integrationtest/org/apache/poi/stress/FileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/FileHandler.java
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.stress;
 
+import java.io.File;
 import java.io.InputStream;
 
 /**
@@ -34,4 +35,10 @@ public interface FileHandler {
      * @throws Exception
      */
     void handleFile(InputStream stream) throws Exception;
+
+    /**
+     * Ensures that extracting text from the given file
+     * is returning some text.
+     */
+    void handleExtracting(File file) throws Exception;
 }
diff --git a/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java b/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java
index dfa8750058..9f492bf0ed 100644
--- a/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/HMEFFileHandler.java
@@ -26,7 +26,7 @@ import org.apache.poi.hmef.attribute.MAPIAttribute;
 import org.apache.poi.hmef.attribute.MAPIStringAttribute;
 import org.junit.Test;
 
-public class HMEFFileHandler implements FileHandler {
+public class HMEFFileHandler extends AbstractFileHandler {
 
     @Override
     public void handleFile(InputStream stream) throws Exception {
diff --git a/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java b/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java
index b7d846ae62..477ee859cb 100644
--- a/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/HPSFFileHandler.java
@@ -25,7 +25,7 @@ import org.apache.poi.hpsf.HPSFPropertiesOnlyDocument;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.junit.Test;
 
-public class HPSFFileHandler implements FileHandler {
+public class HPSFFileHandler extends AbstractFileHandler {
     @Override
     public void handleFile(InputStream stream) throws Exception {
         HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream));
diff --git a/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java b/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java
index 19dbd97a0e..dd579c4dba 100644
--- a/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/HSSFFileHandler.java
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.stress;
 
+import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
 
@@ -49,4 +50,10 @@ public class HSSFFileHandler extends SpreadsheetHandler {
             stream.close();
         }
     }
+
+    // a test-case to test this locally without executing the full TestAllFiles
+    @Test
+    public void testExtractor() throws Exception {
+        handleExtracting(new File("test-data/spreadsheet/BOOK_in_capitals.xls"));
+    }
 }
\ No newline at end of file
diff --git a/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java b/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java
index 31deac7106..5c4a36e3ca 100644
--- a/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/POIFSFileHandler.java
@@ -25,7 +25,7 @@ import java.io.InputStream;
 import org.apache.poi.POIDocument;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
-public class POIFSFileHandler implements FileHandler {
+public class POIFSFileHandler extends AbstractFileHandler {
 
     @Override
     public void handleFile(InputStream stream) throws Exception {
diff --git a/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java b/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
index aad703ce98..f12bbd2de5 100644
--- a/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/SpreadsheetHandler.java
@@ -30,7 +30,7 @@ import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.ss.usermodel.WorkbookFactory;
 
-public abstract class SpreadsheetHandler implements FileHandler {
+public abstract class SpreadsheetHandler extends AbstractFileHandler {
     public void handleWorkbook(Workbook wb, String extension) throws IOException {
         // try to access some of the content
         readContent(wb);
diff --git a/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java b/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
index 3464218fd9..e6cbb184b2 100644
--- a/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/XSLFFileHandler.java
@@ -25,7 +25,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.xslf.XSLFSlideShow;
 import org.junit.Test;
 
-public class XSLFFileHandler implements FileHandler {
+public class XSLFFileHandler extends AbstractFileHandler {
     @Override
     public void handleFile(InputStream stream) throws Exception {
         // ignore password protected files
diff --git a/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java b/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java
index a96d46da31..47c18d8aa0 100644
--- a/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java
+++ b/src/integrationtest/org/apache/poi/stress/XWPFFileHandler.java
@@ -22,7 +22,7 @@ import java.io.InputStream;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.junit.Test;
 
-public class XWPFFileHandler implements FileHandler {
+public class XWPFFileHandler extends AbstractFileHandler {
     @Override
     public void handleFile(InputStream stream) throws Exception {
         // ignore password protected files
diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
index 65d1e3d693..a0b6b5db17 100644
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@@ -213,7 +213,9 @@ public class ExtractorFactory {
     {
         // Look for certain entries in the stream, to figure it
         // out from
-        if (poifsDir.hasEntry("Workbook")) {
+        if (poifsDir.hasEntry("Workbook") ||
+                // some XLS files have different entry-names
+                poifsDir.hasEntry("WORKBOOK") || poifsDir.hasEntry("BOOK")) {
             if (getPreferEventExtractor()) {
                 return new EventBasedExcelExtractor(poifsDir);
             }
diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
index 6f43ba126b..39ef5be8a4 100644
--- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFExcelExtractor.java
@@ -80,7 +80,11 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
         }
         POIXMLTextExtractor extractor =
                 new XSSFExcelExtractor(args[0]);
-        System.out.println(extractor.getText());
+        try {
+            System.out.println(extractor.getText());
+        } finally {
+            extractor.close();
+        }
     }
 
     /**
@@ -237,7 +241,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
                         if (type == Cell.CELL_TYPE_NUMERIC) {
                             CellStyle cs = cell.getCellStyle();
 
-                            if (cs.getDataFormatString() != null) {
+                            if (cs != null && cs.getDataFormatString() != null) {
                                 text.append(formatter.formatRawCellContents(
                                         cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString()
                                 ));