mirror of https://github.com/apache/poi.git
* Add text-extraction verification to integration-tests via a new abstract base FileHandler
* Fix NullPointerException found in some documents when running against the test-data * Add support for extracting text from Dir-Entries WORKBOOK and BOOK to support some old/strangely formatted XLS files. git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1662652 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
51a428c5ee
commit
76307fe94b
|
@ -253,20 +253,26 @@ public class TestAllFiles {
|
||||||
@Test
|
@Test
|
||||||
public void testAllFiles() throws Exception {
|
public void testAllFiles() throws Exception {
|
||||||
assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler);
|
assertNotNull("Unknown file extension for file: " + file + ": " + getExtension(file), handler);
|
||||||
InputStream stream = new BufferedInputStream(new FileInputStream(new File(ROOT_DIR, file)),100);
|
File inputFile = new File(ROOT_DIR, file);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
handler.handleFile(stream);
|
InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100);
|
||||||
|
try {
|
||||||
assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
|
handler.handleFile(stream);
|
||||||
EXPECTED_FAILURES.contains(file));
|
|
||||||
} catch (Exception e) {
|
assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!",
|
||||||
// check if we expect failure for this file
|
EXPECTED_FAILURES.contains(file));
|
||||||
if(!EXPECTED_FAILURES.contains(file)) {
|
} finally {
|
||||||
throw new Exception("While handling " + file, e);
|
stream.close();
|
||||||
}
|
}
|
||||||
} finally {
|
|
||||||
stream.close();
|
handler.handleExtracting(inputFile);
|
||||||
}
|
} catch (Exception e) {
|
||||||
|
// check if we expect failure for this file
|
||||||
|
if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
|
||||||
|
throw new Exception("While handling " + file, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String getExtension(String file) {
|
private static String getExtension(String file) {
|
||||||
|
@ -282,5 +288,9 @@ public class TestAllFiles {
|
||||||
@Override
|
@Override
|
||||||
public void handleFile(InputStream stream) throws Exception {
|
public void handleFile(InputStream stream) throws Exception {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void handleExtracting(File file) throws Exception {
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,55 @@
|
||||||
|
package org.apache.poi.stress;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.poi.POITextExtractor;
|
||||||
|
import org.apache.poi.extractor.ExtractorFactory;
|
||||||
|
|
||||||
|
public abstract class AbstractFileHandler implements FileHandler {
|
||||||
|
public static final Set<String> EXPECTED_EXTRACTOR_FAILURES = new HashSet<String>();
|
||||||
|
static {
|
||||||
|
// password protected files
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("document/bug53475-password-is-pass.docx");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("poifs/extenxls_pwd123.xlsx");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("poifs/protect.xlsx");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_agile.docx");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("poifs/protected_sha512.xlsx");
|
||||||
|
|
||||||
|
// unsupported file-types, no supported OLE2 parts
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("hmef/quick-winmail.dat");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("hmef/winmail-sample1.dat");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-simple.dat");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("hmef/bug52400-winmail-with-attachments.dat");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("hpsf/Test0313rur.adm");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("hsmf/attachment_msg_pdf.msg");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("poifs/Notes.ole2");
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.add("slideshow/testPPT.thmx");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void handleExtracting(File file) throws Exception {
|
||||||
|
POITextExtractor extractor = ExtractorFactory.createExtractor(file);
|
||||||
|
try {
|
||||||
|
assertNotNull(extractor);
|
||||||
|
|
||||||
|
assertNotNull(extractor.getText());
|
||||||
|
|
||||||
|
// also try metadata
|
||||||
|
POITextExtractor metadataExtractor = extractor.getMetadataTextExtractor();
|
||||||
|
assertNotNull(metadataExtractor.getText());
|
||||||
|
|
||||||
|
assertFalse("Expected Extraction to fail for file " + file + " and handler " + this + ", but did not fail!",
|
||||||
|
EXPECTED_EXTRACTOR_FAILURES.contains(file));
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) {
|
||||||
|
throw new Exception("While handling " + file, e);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
extractor.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -16,6 +16,7 @@
|
||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.stress;
|
package org.apache.poi.stress;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -34,4 +35,10 @@ public interface FileHandler {
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
void handleFile(InputStream stream) throws Exception;
|
void handleFile(InputStream stream) throws Exception;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ensures that extracting text from the given file
|
||||||
|
* is returning some text.
|
||||||
|
*/
|
||||||
|
void handleExtracting(File file) throws Exception;
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,7 +26,7 @@ import org.apache.poi.hmef.attribute.MAPIAttribute;
|
||||||
import org.apache.poi.hmef.attribute.MAPIStringAttribute;
|
import org.apache.poi.hmef.attribute.MAPIStringAttribute;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class HMEFFileHandler implements FileHandler {
|
public class HMEFFileHandler extends AbstractFileHandler {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void handleFile(InputStream stream) throws Exception {
|
public void handleFile(InputStream stream) throws Exception {
|
||||||
|
|
|
@ -25,7 +25,7 @@ import org.apache.poi.hpsf.HPSFPropertiesOnlyDocument;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class HPSFFileHandler implements FileHandler {
|
public class HPSFFileHandler extends AbstractFileHandler {
|
||||||
@Override
|
@Override
|
||||||
public void handleFile(InputStream stream) throws Exception {
|
public void handleFile(InputStream stream) throws Exception {
|
||||||
HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream));
|
HPSFPropertiesOnlyDocument hpsf = new HPSFPropertiesOnlyDocument(new POIFSFileSystem(stream));
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.stress;
|
package org.apache.poi.stress;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
@ -49,4 +50,10 @@ public class HSSFFileHandler extends SpreadsheetHandler {
|
||||||
stream.close();
|
stream.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// a test-case to test this locally without executing the full TestAllFiles
|
||||||
|
@Test
|
||||||
|
public void testExtractor() throws Exception {
|
||||||
|
handleExtracting(new File("test-data/spreadsheet/BOOK_in_capitals.xls"));
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -25,7 +25,7 @@ import java.io.InputStream;
|
||||||
import org.apache.poi.POIDocument;
|
import org.apache.poi.POIDocument;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
public class POIFSFileHandler implements FileHandler {
|
public class POIFSFileHandler extends AbstractFileHandler {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void handleFile(InputStream stream) throws Exception {
|
public void handleFile(InputStream stream) throws Exception {
|
||||||
|
|
|
@ -30,7 +30,7 @@ import org.apache.poi.ss.usermodel.Sheet;
|
||||||
import org.apache.poi.ss.usermodel.Workbook;
|
import org.apache.poi.ss.usermodel.Workbook;
|
||||||
import org.apache.poi.ss.usermodel.WorkbookFactory;
|
import org.apache.poi.ss.usermodel.WorkbookFactory;
|
||||||
|
|
||||||
public abstract class SpreadsheetHandler implements FileHandler {
|
public abstract class SpreadsheetHandler extends AbstractFileHandler {
|
||||||
public void handleWorkbook(Workbook wb, String extension) throws IOException {
|
public void handleWorkbook(Workbook wb, String extension) throws IOException {
|
||||||
// try to access some of the content
|
// try to access some of the content
|
||||||
readContent(wb);
|
readContent(wb);
|
||||||
|
|
|
@ -25,7 +25,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||||
import org.apache.poi.xslf.XSLFSlideShow;
|
import org.apache.poi.xslf.XSLFSlideShow;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class XSLFFileHandler implements FileHandler {
|
public class XSLFFileHandler extends AbstractFileHandler {
|
||||||
@Override
|
@Override
|
||||||
public void handleFile(InputStream stream) throws Exception {
|
public void handleFile(InputStream stream) throws Exception {
|
||||||
// ignore password protected files
|
// ignore password protected files
|
||||||
|
|
|
@ -22,7 +22,7 @@ import java.io.InputStream;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class XWPFFileHandler implements FileHandler {
|
public class XWPFFileHandler extends AbstractFileHandler {
|
||||||
@Override
|
@Override
|
||||||
public void handleFile(InputStream stream) throws Exception {
|
public void handleFile(InputStream stream) throws Exception {
|
||||||
// ignore password protected files
|
// ignore password protected files
|
||||||
|
|
|
@ -213,7 +213,9 @@ public class ExtractorFactory {
|
||||||
{
|
{
|
||||||
// Look for certain entries in the stream, to figure it
|
// Look for certain entries in the stream, to figure it
|
||||||
// out from
|
// out from
|
||||||
if (poifsDir.hasEntry("Workbook")) {
|
if (poifsDir.hasEntry("Workbook") ||
|
||||||
|
// some XLS files have different entry-names
|
||||||
|
poifsDir.hasEntry("WORKBOOK") || poifsDir.hasEntry("BOOK")) {
|
||||||
if (getPreferEventExtractor()) {
|
if (getPreferEventExtractor()) {
|
||||||
return new EventBasedExcelExtractor(poifsDir);
|
return new EventBasedExcelExtractor(poifsDir);
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,7 +80,11 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
|
||||||
}
|
}
|
||||||
POIXMLTextExtractor extractor =
|
POIXMLTextExtractor extractor =
|
||||||
new XSSFExcelExtractor(args[0]);
|
new XSSFExcelExtractor(args[0]);
|
||||||
System.out.println(extractor.getText());
|
try {
|
||||||
|
System.out.println(extractor.getText());
|
||||||
|
} finally {
|
||||||
|
extractor.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -237,7 +241,7 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
|
||||||
if (type == Cell.CELL_TYPE_NUMERIC) {
|
if (type == Cell.CELL_TYPE_NUMERIC) {
|
||||||
CellStyle cs = cell.getCellStyle();
|
CellStyle cs = cell.getCellStyle();
|
||||||
|
|
||||||
if (cs.getDataFormatString() != null) {
|
if (cs != null && cs.getDataFormatString() != null) {
|
||||||
text.append(formatter.formatRawCellContents(
|
text.append(formatter.formatRawCellContents(
|
||||||
cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString()
|
cell.getNumericCellValue(), cs.getDataFormat(), cs.getDataFormatString()
|
||||||
));
|
));
|
||||||
|
|
Loading…
Reference in New Issue