diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index e3cbd0b1a1..f55a853710 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + Text Extraction support for older Word 6 and Word 95 files via HWPF 49508 - Allow the addition of paragraphs to XWPF Table Cells 49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text XSLFSlideShow shouldn't break on .thmx (theme) files. Support for them is still very limited though diff --git a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java index 6a2379e19a..2601f77aea 100644 --- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java +++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java @@ -38,6 +38,8 @@ import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hwpf.OldWordFileFormatException; +import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; @@ -218,7 +220,12 @@ public class ExtractorFactory { } } if(entry.getName().equals("WordDocument")) { - return new WordExtractor(poifsDir, fs); + // Old or new style word document? + try { + return new WordExtractor(poifsDir, fs); + } catch(OldWordFileFormatException e) { + return new Word6Extractor(poifsDir, fs); + } } if(entry.getName().equals("PowerPoint Document")) { return new PowerPointExtractor(poifsDir, fs); @@ -230,12 +237,12 @@ public class ExtractorFactory { return new PublisherTextExtractor(poifsDir, fs); } if( - entry.getName().equals("__substg1.0_1000001E") || - entry.getName().equals("__substg1.0_1000001F") || - entry.getName().equals("__substg1.0_0047001E") || - entry.getName().equals("__substg1.0_0047001F") || - entry.getName().equals("__substg1.0_0037001E") || - entry.getName().equals("__substg1.0_0037001F") + entry.getName().equals("__substg1.0_1000001E") || + entry.getName().equals("__substg1.0_1000001F") || + entry.getName().equals("__substg1.0_0047001E") || + entry.getName().equals("__substg1.0_0047001F") || + entry.getName().equals("__substg1.0_0037001E") || + entry.getName().equals("__substg1.0_0037001F") ) { return new OutlookTextExtactor(poifsDir, fs); } diff --git a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java index f4f178f227..0e4edeef9f 100644 --- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java +++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java @@ -29,6 +29,7 @@ import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hsmf.extractor.OutlookTextExtactor; import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; @@ -54,6 +55,8 @@ public class TestExtractorFactory extends TestCase { private File xlsEmb; private File doc; + private File doc6; + private File doc95; private File docx; private File dotx; private File docEmb; @@ -79,6 +82,8 @@ public class TestExtractorFactory extends TestCase { POIDataSamples wpTests = POIDataSamples.getDocumentInstance(); doc = wpTests.getFile("SampleDoc.doc"); + doc6 = wpTests.getFile("Word6.doc"); + doc95 = wpTests.getFile("Word95.doc"); docx = wpTests.getFile("SampleDoc.docx"); dotx = wpTests.getFile("test.dotx"); docEmb = wpTests.getFile("word_with_embeded.doc"); @@ -135,6 +140,23 @@ public class TestExtractorFactory extends TestCase { ExtractorFactory.createExtractor(doc).getText().length() > 120 ); + assertTrue( + ExtractorFactory.createExtractor(doc6) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(doc6).getText().length() > 20 + ); + + assertTrue( + ExtractorFactory.createExtractor(doc95) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(doc95).getText().length() > 120 + ); + + assertTrue( ExtractorFactory.createExtractor(docx) instanceof XWPFWordExtractor @@ -231,6 +253,22 @@ public class TestExtractorFactory extends TestCase { ExtractorFactory.createExtractor(new FileInputStream(doc)).getText().length() > 120 ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(doc6)) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(doc6)).getText().length() > 20 + ); + + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(doc95)) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(new FileInputStream(doc95)).getText().length() > 120 + ); + assertTrue( ExtractorFactory.createExtractor(new FileInputStream(docx)) instanceof XWPFWordExtractor @@ -311,6 +349,22 @@ public class TestExtractorFactory extends TestCase { ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120 ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20 + ); + + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))) + instanceof Word6Extractor + ); + assertTrue( + ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120 + ); + // PowerPoint assertTrue( ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))) diff --git a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java index bd31f6253d..00d1162e90 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java @@ -169,7 +169,7 @@ public final class HWPFDocument extends HWPFDocumentCore // Is this document too old for us? if(_fib.getNFib() < 106) { - throw new OldWordFileFormatException("The document is too old (Word 95 or older) "); + throw new OldWordFileFormatException("The document is too old - Word 95 or older. Try HWPFOldDocument instead?"); } // use the fib to determine the name of the table stream.