diff --git a/src/integrationtest/org/apache/poi/TestAllFiles.java b/src/integrationtest/org/apache/poi/TestAllFiles.java index e8de685f9c..8a66024f7b 100644 --- a/src/integrationtest/org/apache/poi/TestAllFiles.java +++ b/src/integrationtest/org/apache/poi/TestAllFiles.java @@ -31,6 +31,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.stress.*; import org.apache.tools.ant.DirectoryScanner; import org.junit.Test; @@ -162,6 +163,20 @@ public class TestAllFiles { HANDLERS.put("spreadsheet/test_properties1", new NullFileHandler()); } + // Old Word Documents where we can at least extract some text + private static final Set OLD_FILES = new HashSet(); + static { + OLD_FILES.add("document/Bug49933.doc"); + OLD_FILES.add("document/Bug51944.doc"); + OLD_FILES.add("document/Word6.doc"); + OLD_FILES.add("document/Word6_sections.doc"); + OLD_FILES.add("document/Word6_sections2.doc"); + OLD_FILES.add("document/Word95.doc"); + OLD_FILES.add("document/word95err.doc"); + OLD_FILES.add("hpsf/TestMickey.doc"); + OLD_FILES.add("document/52117.doc"); + } + private static final Set EXPECTED_FAILURES = new HashSet(); static { // password protected files @@ -202,15 +217,7 @@ public class TestAllFiles { EXPECTED_FAILURES.add("spreadsheet/43493.xls"); EXPECTED_FAILURES.add("spreadsheet/46904.xls"); EXPECTED_FAILURES.add("document/56880.doc"); - EXPECTED_FAILURES.add("document/Bug49933.doc"); EXPECTED_FAILURES.add("document/Bug50955.doc"); - EXPECTED_FAILURES.add("document/Bug51944.doc"); - EXPECTED_FAILURES.add("document/Word6.doc"); - EXPECTED_FAILURES.add("document/Word6_sections.doc"); - EXPECTED_FAILURES.add("document/Word6_sections2.doc"); - EXPECTED_FAILURES.add("document/Word95.doc"); - EXPECTED_FAILURES.add("document/word95err.doc"); - EXPECTED_FAILURES.add("hpsf/TestMickey.doc"); EXPECTED_FAILURES.add("slideshow/PPT95.ppt"); EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DCTermsNamespaceLimitedUseFAIL.docx"); EXPECTED_FAILURES.add("openxml4j/OPCCompliance_CoreProperties_DoNotUseCompatibilityMarkupFAIL.docx"); @@ -269,17 +276,29 @@ public class TestAllFiles { File inputFile = new File(ROOT_DIR, file); try { - InputStream stream = new BufferedInputStream(new FileInputStream(inputFile),100); + InputStream stream = new BufferedInputStream(new FileInputStream(inputFile), 64*1024); try { handler.handleFile(stream); assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", EXPECTED_FAILURES.contains(file)); + assertFalse("Expected to fail for file " + file + " and handler " + handler + ", but did not fail!", + OLD_FILES.contains(file)); } finally { stream.close(); } handler.handleExtracting(inputFile); + } catch (OldWordFileFormatException e) { + // for old word files we should still support extracting text + if(OLD_FILES.contains(file)) { + handler.handleExtracting(inputFile); + } else { + // check if we expect failure for this file + if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) { + throw new Exception("While handling " + file, e); + } + } } catch (Exception e) { // check if we expect failure for this file if(!EXPECTED_FAILURES.contains(file) && !AbstractFileHandler.EXPECTED_EXTRACTOR_FAILURES.contains(file)) { diff --git a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java index 8a27e6d0e9..8819083771 100644 --- a/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/AbstractFileHandler.java @@ -28,8 +28,10 @@ import java.io.InputStream; import java.util.HashSet; import java.util.Set; +import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POITextExtractor; import org.apache.poi.extractor.ExtractorFactory; +import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.xmlbeans.XmlException; @@ -89,6 +91,19 @@ public abstract class AbstractFileHandler implements FileHandler { assertEquals("File should not be modified by extractor", modified, file.lastModified()); handleExtractingAsStream(file); + + if(extractor instanceof POIOLE2TextExtractor) { + HPSFPropertiesExtractor hpsfExtractor = new HPSFPropertiesExtractor((POIOLE2TextExtractor)extractor); + try { + assertNotNull(hpsfExtractor.getDocumentSummaryInformationText()); + assertNotNull(hpsfExtractor.getSummaryInformationText()); + String text = hpsfExtractor.getText(); + //System.out.println(text); + assertNotNull(text); + } finally { + hpsfExtractor.close(); + } + } } catch (IllegalArgumentException e) { if(!EXPECTED_EXTRACTOR_FAILURES.contains(file)) { throw new Exception("While handling " + file, e); diff --git a/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java b/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java index 3a223674cd..a56ddd2dc6 100644 --- a/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java +++ b/src/integrationtest/org/apache/poi/stress/HWPFFileHandler.java @@ -63,12 +63,10 @@ public class HWPFFileHandler extends POIFSFileHandler { docTextWriter.close(); } - - // a test-case to test this locally without executing the full TestAllFiles @Test public void test() throws Exception { - File file = new File("test-data/document/51921-Word-Crash067.doc"); + File file = new File("test-data/document/52117.doc"); InputStream stream = new FileInputStream(file); try { @@ -91,4 +89,10 @@ public class HWPFFileHandler extends POIFSFileHandler { stream.close(); } } + + @Test + public void testExtractingOld() throws Exception { + File file = new File("test-data/document/52117.doc"); + handleExtracting(file); + } } diff --git a/test-data/document/52117.doc b/test-data/document/52117.doc new file mode 100644 index 0000000000..4f966c01c5 Binary files /dev/null and b/test-data/document/52117.doc differ