Initial ExtractorFactory support for building TextExtractors for embedded documents

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@691351 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-09-02 19:37:52 +00:00
parent b83a13bb2a
commit e4ff06ec79
11 changed files with 175 additions and 7 deletions

View File

@ -41,6 +41,7 @@
</release>
-->
<release version="3.5.1-beta2" date="2008-08-20">
<action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
<action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
<action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>

View File

@ -38,6 +38,7 @@
</release>
-->
<release version="3.5.1-beta2" date="2008-08-20">
<action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
<action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
<action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>

View File

@ -19,6 +19,7 @@ package org.apache.poi;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Common Parent for OLE2 based Text Extractors
@ -59,4 +60,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
public POITextExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(this);
}
/**
 * Gives access to the POIFS FileSystem which backs
 *  the document this extractor works on.
 *
 * @return the filesystem held by the wrapped document
 */
public POIFSFileSystem getFileSystem() {
    POIFSFileSystem backing = document.filesystem;
    return backing;
}
}

View File

@ -26,6 +26,7 @@ import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@ -48,7 +49,10 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
this.wb = wb;
}
/**
 * Creates an extractor from a whole POIFS filesystem,
 *  working from the filesystem's root directory.
 *
 * @param fs the POIFS filesystem holding the workbook
 */
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
this(new HSSFWorkbook(fs));
this(fs.getRoot(), fs);
}
/**
 * Creates an extractor for the workbook stored under the
 *  given directory of a POIFS filesystem, by opening a
 *  HSSFWorkbook on that directory.
 * NOTE(review): the trailing <code>true</code> is handed straight
 *  to the HSSFWorkbook constructor; presumably it is the
 *  "preserve nodes" flag - confirm against HSSFWorkbook.
 *
 * @param dir the directory holding the workbook
 * @param fs the POIFS filesystem passed along with the directory
 */
public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
this(new HSSFWorkbook(dir, fs, true));
}

View File

@ -18,9 +18,11 @@ package org.apache.poi.extractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.poi.POIOLE2TextExtractor;
@ -31,6 +33,8 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xslf.XSLFSlideShow;
@ -105,24 +109,95 @@ public class ExtractorFactory {
}
/**
 * Builds the matching text extractor for an OLE2 document,
 *  looking in the root directory of the given filesystem.
 *
 * @param fs the POIFS filesystem holding the document
 * @return the extractor chosen by the directory based overload
 * @throws IOException passed on from the directory based overload
 */
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
    DirectoryNode rootDir = fs.getRoot();
    return createExtractor(rootDir, fs);
}
/**
 * Picks a text extractor by scanning the entries of the given
 *  directory for a well known entry name: "Workbook" (Excel),
 *  "WordDocument" (Word), "PowerPoint Document" (PowerPoint)
 *  or "VisioDocument" (Visio). The first entry with one of
 *  these names decides which extractor is returned.
 *
 * @param poifsDir the directory whose entries are inspected
 * @param fs the POIFS filesystem, handed on to the extractor's constructor
 * @return the extractor for the first recognised entry
 * @throws IllegalArgumentException if no entry has a recognised name
 */
public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
// Look for certain entries in the stream, to figure it
// out from
for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) {
Entry entry = (Entry)entries.next();
if(entry.getName().equals("Workbook")) {
return new ExcelExtractor(fs);
return new ExcelExtractor(poifsDir, fs);
}
if(entry.getName().equals("WordDocument")) {
return new WordExtractor(fs);
return new WordExtractor(poifsDir, fs);
}
if(entry.getName().equals("PowerPoint Document")) {
return new PowerPointExtractor(fs);
return new PowerPointExtractor(poifsDir, fs);
}
if(entry.getName().equals("VisioDocument")) {
return new VisioTextExtractor(fs);
return new VisioTextExtractor(poifsDir, fs);
}
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
/**
 * Returns an array of text extractors, one for each of
 *  the embedded documents in the file (if there are any).
 * If there are no embedded documents, you'll get back an
 *  empty array. Otherwise, you'll get one open
 *  {@link POITextExtractor} for each embedded file.
 * For Excel, the embedded documents are looked for in the
 *  "MBD..." directories under the root; for Word, in the
 *  "_..." directories inside "ObjectPool". PowerPoint
 *  isn't handled yet, and so always gives an empty array.
 * Entries with a matching name which aren't directories
 *  are skipped, as they can't hold an embedded document.
 *
 * @param ext the extractor of the containing document
 * @return one extractor per embedded document found, never null
 * @throws IllegalStateException if the extractor has no POIFS filesystem
 * @throws IllegalArgumentException if an embedded directory doesn't
 *  hold a document type that createExtractor recognises
 * @throws IOException passed on from createExtractor
 */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
    POIFSFileSystem fs = ext.getFileSystem();
    if(fs == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    // Find all the embedded directories
    ArrayList dirs = new ArrayList();
    if(ext instanceof ExcelExtractor) {
        // These are in MBD... under the root
        addMatchingDirectories(fs.getRoot(), "MBD", dirs);
    } else if(ext instanceof WordExtractor) {
        // These are in ObjectPool -> _... under the root
        try {
            Entry op = fs.getRoot().getEntry("ObjectPool");
            // Guard the cast, in case ObjectPool isn't a directory
            if(op instanceof DirectoryEntry) {
                addMatchingDirectories((DirectoryEntry)op, "_", dirs);
            }
        } catch(FileNotFoundException e) {
            // No ObjectPool present, which just means that
            //  there are no embedded documents to report
        }
    } else if(ext instanceof PowerPointExtractor) {
        // Tricky, not stored directly in poifs
        // TODO
    }

    // Create the extractors (an empty array if nothing was found)
    POITextExtractor[] te = new POITextExtractor[dirs.size()];
    for(int i=0; i<te.length; i++) {
        te[i] = createExtractor((DirectoryNode)dirs.get(i), fs);
    }
    return te;
}

/**
 * Adds to dirs every child of parent which is a directory
 *  and whose name starts with the given prefix.
 */
private static void addMatchingDirectories(DirectoryEntry parent, String prefix, ArrayList dirs) {
    for(Iterator it = parent.getEntries(); it.hasNext(); ) {
        Entry entry = (Entry)it.next();
        // Only directories can be handed to createExtractor later on
        if(entry instanceof DirectoryNode && entry.getName().startsWith(prefix)) {
            dirs.add(entry);
        }
    }
}
/**
 * Placeholder for fetching the text extractors of the
 *  documents embedded in a file handled by a
 *  {@link POIXMLTextExtractor}. This isn't implemented
 *  yet, so every call fails with an exception.
 *
 * @param ext the extractor of the containing document (currently unused)
 * @return nothing at present, as the method always throws
 * @throws IllegalStateException always, with the message "Not yet supported"
 */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
throw new IllegalStateException("Not yet supported");
}
}

View File

@ -20,6 +20,8 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
@ -42,6 +44,7 @@ public class TestExtractorFactory extends TestCase {
private String word_dir;
private String powerpoint_dir;
private String visio_dir;
private String poifs_dir;
private File txt;
@ -63,6 +66,12 @@ public class TestExtractorFactory extends TestCase {
word_dir = System.getProperty("HWPF.testdata.path");
powerpoint_dir = System.getProperty("HSLF.testdata.path");
visio_dir = System.getProperty("HDGF.testdata.path");
poifs_dir = System.getProperty("POIFS.testdata.path");
assertNotNull(excel_dir);
assertNotNull(word_dir);
assertNotNull(powerpoint_dir);
assertNotNull(visio_dir);
assertNotNull(poifs_dir);
txt = new File(powerpoint_dir, "SampleShow.txt");
@ -300,4 +309,56 @@ public class TestExtractorFactory extends TestCase {
// Good
}
}
/**
 * Checks the text extractors built for embedded documents.
 * Only POIFS based containers are covered for now, with
 *  the OOXML ones to follow at some point.
 */
public void testEmbeded() throws Exception {
    POITextExtractor[] found;

    // A file with nothing embedded in it
    found = ExtractorFactory.getEmbededDocsTextExtractors(
        (POIOLE2TextExtractor)ExtractorFactory.createExtractor(xls)
    );
    assertEquals(0, found.length);

    // Excel as the container
    File excelFile = new File(poifs_dir, "excel_with_embeded.xls");
    found = ExtractorFactory.getEmbededDocsTextExtractors(
        (POIOLE2TextExtractor)ExtractorFactory.createExtractor(excelFile)
    );
    checkEmbeds(new Class[] {
        PowerPointExtractor.class,
        ExcelExtractor.class,
        ExcelExtractor.class,
        PowerPointExtractor.class,
        WordExtractor.class,
        WordExtractor.class
    }, found);

    // Word as the container
    File wordFile = new File(poifs_dir, "word_with_embeded.doc");
    found = ExtractorFactory.getEmbededDocsTextExtractors(
        (POIOLE2TextExtractor)ExtractorFactory.createExtractor(wordFile)
    );
    checkEmbeds(new Class[] {
        WordExtractor.class,
        ExcelExtractor.class,
        ExcelExtractor.class,
        PowerPointExtractor.class
    }, found);

    // TODO - PowerPoint
    // TODO - Visio
}

/**
 * Asserts that the extractors are of the expected types, in
 *  the expected order, and then that each of them gives back
 *  more than 20 characters of text.
 */
private static void checkEmbeds(Class[] expectedTypes, POITextExtractor[] found) throws Exception {
    assertEquals(expectedTypes.length, found.length);
    for(int i=0; i<expectedTypes.length; i++) {
        assertTrue(expectedTypes[i].isInstance(found[i]));
    }
    for(int i=0; i<found.length; i++) {
        assertTrue(found[i].getText().length() > 20);
    }
}
}

View File

@ -64,6 +64,10 @@ public class ChunkFactory {
private void processChunkParseCommands() throws IOException {
String line;
InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName);
if(cpd == null) {
throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName);
}
BufferedReader inp = new BufferedReader(new InputStreamReader(cpd));
while( (line = inp.readLine()) != null ) {
if(line.startsWith("#")) continue;

View File

@ -28,6 +28,7 @@ import org.apache.poi.hdgf.chunks.Chunk.Command;
import org.apache.poi.hdgf.streams.ChunkStream;
import org.apache.poi.hdgf.streams.PointerContainingStream;
import org.apache.poi.hdgf.streams.Stream;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@ -44,7 +45,10 @@ public class VisioTextExtractor extends POIOLE2TextExtractor {
this.hdgf = hdgf;
}
/**
 * Creates an extractor from a whole POIFS filesystem,
 *  working from the filesystem's root directory.
 *
 * @param fs the POIFS filesystem holding the diagram
 */
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
this(new HDGFDiagram(fs));
this(fs.getRoot(), fs);
}
/**
 * Creates an extractor for the diagram stored under the
 *  given directory of a POIFS filesystem, by opening a
 *  HDGFDiagram on that directory.
 *
 * @param dir the directory holding the diagram
 * @param fs the POIFS filesystem passed along with the directory
 */
public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
this(new HDGFDiagram(dir, fs));
// Remember the filesystem on this extractor
this.fs = fs;
}
public VisioTextExtractor(InputStream inp) throws IOException {

View File

@ -30,6 +30,7 @@ import org.apache.poi.hslf.model.Notes;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@ -96,6 +97,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
/**
 * Creates a PowerPointExtractor from a POIFS filesystem,
 *  by opening a HSLFSlideShow on it.
 *
 * @param fs the POIFS filesystem holding the slideshow
 */
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
this(new HSLFSlideShow(fs));
}
/**
 * Creates a PowerPointExtractor for the slideshow stored
 *  under the given directory of a POIFS filesystem, by
 *  opening a HSLFSlideShow on that directory.
 *
 * @param dir the directory holding the slideshow
 * @param fs the POIFS filesystem passed along with the directory
 */
public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
this(new HSLFSlideShow(dir, fs));
}
/**
* Creates a PowerPointExtractor, from a HSLFSlideShow

View File

@ -28,6 +28,7 @@ import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
@ -58,6 +59,10 @@ public class WordExtractor extends POIOLE2TextExtractor {
this(new HWPFDocument(fs));
this.fs = fs;
}
/**
 * Creates a WordExtractor for the document stored under
 *  the given directory of a POIFS filesystem, by opening
 *  a HWPFDocument on that directory.
 *
 * @param dir the directory holding the document
 * @param fs the POIFS filesystem passed along with the directory
 */
public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
this(new HWPFDocument(dir, fs));
// Remember the filesystem on this extractor
this.fs = fs;
}
/**
* Create a new Word Extractor

Binary file not shown.