* Add text-extraction verification to integration-tests via a new abstract base FileHandler

* Fix NullPointerException found in some documents when running against the test-data
* Add support for extracting text from Dir-Entries WORKBOOK and BOOK to support some old/strangely formatted XLS files.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1662652 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dominik Stadler 2015-02-27 09:59:14 +00:00
parent 51a428c5ee
commit 76307fe94b
12 changed files with 107 additions and 22 deletions

View File

@ -253,20 +253,26 @@ public class TestAllFiles {
@Test @Test
public void testAllFiles() throws Exception { public void testAllFiles() throws Exception {
assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler); assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler);
InputStream stream = new BufferedInputStream(new FileInputStream(new File(ROOT_DIR, file)),100); File inputFile = new File(ROOT_DIR, file);
try { try {
handler.handleFile(stream); InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100);
try {
assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", handler.handleFile(stream);
EXPECTED_FAILURES.contains(file));
} catch (Exception e) { assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
// check if we expect failure for this file EXPECTED_FAILURES.contains(file));
if(!EXPECTED_FAILURES.contains(file)) { } finally {
throw new Exception("While handling " + file, e); stream.close();
} }
} finally {
stream.close(); handler.handleExtracting(inputFile);
} } catch (Exception e) {
// check if we expect failure for this file
if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
throw new Exception("While handling " + file, e);
}
}
} }
private static String getExtension(String file) { private static String getExtension(String file) {
@ -282,5 +288,9 @@ public class TestAllFiles {
@Override @Override
public void handleFile(InputStream stream) throws Exception { public void handleFile(InputStream stream) throws Exception {
} }
@Override
public void handleExtracting(File file) throws Exception {
}
} }
} }

View File

@ -0,0 +1,55 @@
package org.apache.poi.stress;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import java.io.File;
import java.util.HashSet;
import java.util.Set;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
/**
 * Base class for the integration-test file handlers that adds a generic
 * text-extraction check on top of {@link FileHandler#handleFile}.
 *
 * <p>{@link #handleExtracting(File)} creates a text extractor for the file via
 * {@link ExtractorFactory} and verifies that both the document text and the
 * metadata text are non-null. Files listed in
 * {@link #EXPECTED_EXTRACTOR_FAILURES} are expected to fail extraction.
 */
public abstract class AbstractFileHandler implements FileHandler {
    /**
     * Files for which text extraction is expected to fail. Every entry has the
     * form {@code "directory/filename"}, using a forward slash as separator.
     */
    public static final Set<String> EXPECTED_EXTRACTOR_FAILURES = new HashSet<String>();
    static {
        // password protected files
        EXPECTED_EXTRACTOR_FAILURES.add("document/bug53475-password-is-pass.docx");
        EXPECTED_EXTRACTOR_FAILURES.add("poifs/extenxls_pwd123.xlsx");
        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protect.xlsx");
        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_agile.docx");
        EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_sha512.xlsx");

        // unsupported file-types, no supported OLE2 parts
        EXPECTED_EXTRACTOR_FAILURES.add("hmef/quick-winmail.dat");
        EXPECTED_EXTRACTOR_FAILURES.add("hmef/winmail-sample1.dat");
        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-simple.dat");
        EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-with-attachments.dat");
        EXPECTED_EXTRACTOR_FAILURES.add("hpsf/Test0313rur.adm");
        EXPECTED_EXTRACTOR_FAILURES.add("hsmf/attachment_msg_pdf.msg");
        EXPECTED_EXTRACTOR_FAILURES.add("poifs/Notes.ole2");
        EXPECTED_EXTRACTOR_FAILURES.add("slideshow/testPPT.thmx");
    }

    /**
     * Verifies that text and metadata text can be extracted from the given file.
     *
     * <p>If the file is listed in {@link #EXPECTED_EXTRACTOR_FAILURES}, a
     * successful extraction is reported as an assertion failure, and an
     * {@link IllegalArgumentException} raised during extraction is tolerated.
     *
     * @param file the document to extract text from
     * @throws Exception if extraction fails with an
     *         {@link IllegalArgumentException} for a file that is not expected
     *         to fail (the original exception is kept as the cause), or if any
     *         other exception is raised while extracting or closing
     */
    public void handleExtracting(File file) throws Exception {
        // The set holds String keys; looking up the File object itself could
        // never match, so derive the "directory/filename" key once up front.
        boolean expectedToFail = EXPECTED_EXTRACTOR_FAILURES.contains(toFailureKey(file));

        POITextExtractor extractor = null;
        try {
            // Created inside the try so that a failure while creating the
            // extractor is subject to the same expected-failure handling.
            // NOTE(review): presumably unsupported file types are rejected here
            // with an IllegalArgumentException - confirm against ExtractorFactory.
            extractor = ExtractorFactory.createExtractor(file);

            assertNotNull(extractor);

            assertNotNull(extractor.getText());

            // also try metadata
            POITextExtractor metadataExtractor = extractor.getMetadataTextExtractor();
            assertNotNull(metadataExtractor.getText());

            assertFalse("Expected Extraction to fail for file " + file + " and handler " + this + ", but did not fail!",
                    expectedToFail);
        } catch (IllegalArgumentException e) {
            if (!expectedToFail) {
                throw new Exception("While handling " + file, e);
            }
        } finally {
            // Guard against null so that a failed creation or a failed
            // assertNotNull is not masked by a NullPointerException from close().
            if (extractor != null) {
                extractor.close();
            }
        }
    }

    /**
     * Builds the lookup key used in {@link #EXPECTED_EXTRACTOR_FAILURES} for a file:
     * the name of its parent directory, a forward slash, and the file name.
     * A file without a parent directory is keyed by its name alone.
     */
    private static String toFailureKey(File file) {
        File parent = file.getParentFile();
        if (parent == null) {
            return file.getName();
        }
        return parent.getName() + "/" + file.getName();
    }
}

View File

@ -16,6 +16,7 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.stress; package org.apache.poi.stress;
import java.io.File;
import java.io.InputStream; import java.io.InputStream;
/** /**
@ -34,4 +35,10 @@ public interface FileHandler {
* @throws Exception * @throws Exception
*/ */
void handleFile(InputStream stream) throws Exception; void handleFile(InputStream stream) throws Exception;
/**
* Ensures that extracting text from the given file
* is returning some text.
*/
void handleExtracting(File file) throws Exception;
} }

View File

@ -26,7 +26,7 @@ import org.apache.poi.hmef.attribute.MAPIAttribute;
import org.apache.poi.hmef.attribute.MAPIStringAttribute; import org.apache.poi.hmef.attribute.MAPIStringAttribute;
import org.junit.Test; import org.junit.Test;
public class HMEFFileHandler implements FileHandler { public class HMEFFileHandler extends AbstractFileHandler {
@Override @Override
public void handleFile(InputStream stream) throws Exception { public void handleFile(InputStream stream) throws Exception {

View File

@ -25,7 +25,7 @@ import org.apache.poi.hpsf.HPSFPropertiesOnlyDocument;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.Test; import org.junit.Test;
public class HPSFFileHandler implements FileHandler { public class HPSFFileHandler extends AbstractFileHandler {
@Override @Override
public void handleFile(InputStream stream) throws Exception { public void handleFile(InputStream stream) throws Exception {
HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream)); HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream));

View File

@ -16,6 +16,7 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.stress; package org.apache.poi.stress;
import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.InputStream; import java.io.InputStream;
@ -49,4 +50,10 @@ public class HSSFFileHandler extends SpreadsheetHandler {
stream.close(); stream.close();
} }
} }
// a test-case to test this locally without executing the full TestAllFiles
@Test
public void testExtractor() throws Exception {
handleExtracting(new File("test-data/spreadsheet/BOOK_in_capitals.xls"));
}
} }

View File

@ -25,7 +25,7 @@ import java.io.InputStream;
import org.apache.poi.POIDocument; import org.apache.poi.POIDocument;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class POIFSFileHandler implements FileHandler { public class POIFSFileHandler extends AbstractFileHandler {
@Override @Override
public void handleFile(InputStream stream) throws Exception { public void handleFile(InputStream stream) throws Exception {

View File

@ -30,7 +30,7 @@ import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory; import org.apache.poi.ss.usermodel.WorkbookFactory;
public abstract class SpreadsheetHandler implements FileHandler { public abstract class SpreadsheetHandler extends AbstractFileHandler {
public void handleWorkbook(Workbook wb, String extension) throws IOException { public void handleWorkbook(Workbook wb, String extension) throws IOException {
// try to access some of the content // try to access some of the content
readContent(wb); readContent(wb);

View File

@ -25,7 +25,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.XSLFSlideShow; import org.apache.poi.xslf.XSLFSlideShow;
import org.junit.Test; import org.junit.Test;
public class XSLFFileHandler implements FileHandler { public class XSLFFileHandler extends AbstractFileHandler {
@Override @Override
public void handleFile(InputStream stream) throws Exception { public void handleFile(InputStream stream) throws Exception {
// ignore password protected files // ignore password protected files

View File

@ -22,7 +22,7 @@ import java.io.InputStream;
import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test; import org.junit.Test;
public class XWPFFileHandler implements FileHandler { public class XWPFFileHandler extends AbstractFileHandler {
@Override @Override
public void handleFile(InputStream stream) throws Exception { public void handleFile(InputStream stream) throws Exception {
// ignore password protected files // ignore password protected files

View File

@ -213,7 +213,9 @@ public class ExtractorFactory {
{ {
// Look for certain entries in the stream, to figure it // Look for certain entries in the stream, to figure it
// out from // out from
if (poifsDir.hasEntry("Workbook")) { if (poifsDir.hasEntry("Workbook") ||
// some XLS files have different entry-names
poifsDir.hasEntry("WORKBOOK") || poifsDir.hasEntry("BOOK")) {
if (getPreferEventExtractor()) { if (getPreferEventExtractor()) {
return new EventBasedExcelExtractor(poifsDir); return new EventBasedExcelExtractor(poifsDir);
} }

View File

@ -80,7 +80,11 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
} }
POIXMLTextExtractor extractor = POIXMLTextExtractor extractor =
new XSSFExcelExtractor(args[0]); new XSSFExcelExtractor(args[0]);
System.out.println(extractor.getText()); try {
System.out.println(extractor.getText());
} finally {
extractor.close();
}
} }
/** /**
@ -237,7 +241,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
if (type == Cell.CELL_TYPE_NUMERIC) { if (type == Cell.CELL_TYPE_NUMERIC) {
CellStyle cs = cell.getCellStyle(); CellStyle cs = cell.getCellStyle();
if (cs.getDataFormatString() != null) { if (cs != null && cs.getDataFormatString() != null) {
text.append(formatter.formatRawCellContents( text.append(formatter.formatRawCellContents(
cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString() cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString()
)); ));