mirror of https://github.com/apache/poi.git
Initial ExtractorFactory support for building TextExtractors for embedded documents
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@691351 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b83a13bb2a
commit
e4ff06ec79
|
@ -41,6 +41,7 @@
|
|||
</release>
|
||||
-->
|
||||
<release version="3.5.1-beta2" date="2008-08-20">
|
||||
<action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
|
||||
|
|
|
@ -38,6 +38,7 @@
|
|||
</release>
|
||||
-->
|
||||
<release version="3.5.1-beta2" date="2008-08-20">
|
||||
<action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.poi;
|
|||
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Common Parent for OLE2 based Text Extractors
|
||||
|
@ -59,4 +60,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
|||
public POITextExtractor getMetadataTextExtractor() {
|
||||
return new HPSFPropertiesExtractor(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the underlying POIFS FileSystem of
|
||||
* this document.
|
||||
*/
|
||||
public POIFSFileSystem getFileSystem() {
|
||||
return document.filesystem;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.poi.hssf.usermodel.HSSFRichTextString;
|
|||
import org.apache.poi.hssf.usermodel.HSSFRow;
|
||||
import org.apache.poi.hssf.usermodel.HSSFSheet;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
|
@ -48,7 +49,10 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
this.wb = wb;
|
||||
}
|
||||
/**
 * Creates an extractor for the workbook found at the root of
 * the given POIFS FileSystem.
 *
 * @param fs the filesystem to read the workbook from
 * @throws IOException if the delegated constructor fails to read the workbook
 */
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
	// Delegate to the directory-aware constructor. Only one explicit
	// constructor invocation is legal, so the superseded direct
	// HSSFWorkbook(fs) call is not kept alongside this one.
	this(fs.getRoot(), fs);
}

/**
 * Creates an extractor for a workbook held in the given directory
 * of a POIFS FileSystem, which need not be the root (for example
 * when the workbook is embedded in another document).
 *
 * @param dir the directory holding the workbook's streams
 * @param fs the filesystem that <code>dir</code> belongs to
 * @throws IOException if the workbook cannot be read
 */
public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
	// NOTE(review): the third argument is presumably HSSFWorkbook's
	// preserve-nodes flag - confirm against HSSFWorkbook
	this(new HSSFWorkbook(dir, fs, true));
}
|
||||
|
||||
|
||||
|
|
|
@ -18,9 +18,11 @@ package org.apache.poi.extractor;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.PushbackInputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
|
@ -31,6 +33,8 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
|||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.xslf.XSLFSlideShow;
|
||||
|
@ -105,24 +109,95 @@ public class ExtractorFactory {
|
|||
}
|
||||
|
||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
||||
return createExtractor(fs.getRoot(), fs);
|
||||
}
|
||||
public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
|
||||
// Look for certain entries in the stream, to figure it
|
||||
// out from
|
||||
for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
|
||||
for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) {
|
||||
Entry entry = (Entry)entries.next();
|
||||
|
||||
if(entry.getName().equals("Workbook")) {
|
||||
return new ExcelExtractor(fs);
|
||||
return new ExcelExtractor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("WordDocument")) {
|
||||
return new WordExtractor(fs);
|
||||
return new WordExtractor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("PowerPoint Document")) {
|
||||
return new PowerPointExtractor(fs);
|
||||
return new PowerPointExtractor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("VisioDocument")) {
|
||||
return new VisioTextExtractor(fs);
|
||||
return new VisioTextExtractor(poifsDir, fs);
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns an array of text extractors, one for each of
|
||||
* the embeded documents in the file (if there are any).
|
||||
* If there are no embeded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embeded file.
|
||||
*/
|
||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
|
||||
// Find all the embeded directories
|
||||
ArrayList dirs = new ArrayList();
|
||||
POIFSFileSystem fs = ext.getFileSystem();
|
||||
if(fs == null) {
|
||||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
||||
}
|
||||
|
||||
if(ext instanceof ExcelExtractor) {
|
||||
// These are in MBD... under the root
|
||||
Iterator it = fs.getRoot().getEntries();
|
||||
while(it.hasNext()) {
|
||||
Entry entry = (Entry)it.next();
|
||||
if(entry.getName().startsWith("MBD")) {
|
||||
dirs.add(entry);
|
||||
}
|
||||
}
|
||||
} else if(ext instanceof WordExtractor) {
|
||||
// These are in ObjectPool -> _... under the root
|
||||
try {
|
||||
DirectoryEntry op = (DirectoryEntry)
|
||||
fs.getRoot().getEntry("ObjectPool");
|
||||
Iterator it = op.getEntries();
|
||||
while(it.hasNext()) {
|
||||
Entry entry = (Entry)it.next();
|
||||
if(entry.getName().startsWith("_")) {
|
||||
dirs.add(entry);
|
||||
}
|
||||
}
|
||||
} catch(FileNotFoundException e) {}
|
||||
} else if(ext instanceof PowerPointExtractor) {
|
||||
// Tricky, not stored directly in poifs
|
||||
// TODO
|
||||
}
|
||||
|
||||
// Create the extractors
|
||||
if(dirs == null || dirs.size() == 0) {
|
||||
return new POITextExtractor[0];
|
||||
}
|
||||
|
||||
POITextExtractor[] te = new POITextExtractor[dirs.size()];
|
||||
for(int i=0; i<te.length; i++) {
|
||||
te[i] = createExtractor(
|
||||
(DirectoryNode)dirs.get(i), ext.getFileSystem()
|
||||
);
|
||||
}
|
||||
return te;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of text extractors, one for each of
|
||||
* the embeded documents in the file (if there are any).
|
||||
* If there are no embeded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embeded file.
|
||||
*/
|
||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
|
||||
throw new IllegalStateException("Not yet supported");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,8 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||
import org.apache.poi.hslf.extractor.PowerPointExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
|
@ -42,6 +44,7 @@ public class TestExtractorFactory extends TestCase {
|
|||
private String word_dir;
|
||||
private String powerpoint_dir;
|
||||
private String visio_dir;
|
||||
private String poifs_dir;
|
||||
|
||||
private File txt;
|
||||
|
||||
|
@ -63,6 +66,12 @@ public class TestExtractorFactory extends TestCase {
|
|||
word_dir = System.getProperty("HWPF.testdata.path");
|
||||
powerpoint_dir = System.getProperty("HSLF.testdata.path");
|
||||
visio_dir = System.getProperty("HDGF.testdata.path");
|
||||
poifs_dir = System.getProperty("POIFS.testdata.path");
|
||||
assertNotNull(excel_dir);
|
||||
assertNotNull(word_dir);
|
||||
assertNotNull(powerpoint_dir);
|
||||
assertNotNull(visio_dir);
|
||||
assertNotNull(poifs_dir);
|
||||
|
||||
txt = new File(powerpoint_dir, "SampleShow.txt");
|
||||
|
||||
|
@ -300,4 +309,56 @@ public class TestExtractorFactory extends TestCase {
|
|||
// Good
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test embeded docs text extraction. For now, only
|
||||
* does poifs embeded, but will do ooxml ones
|
||||
* at some point.
|
||||
*/
|
||||
public void testEmbeded() throws Exception {
|
||||
POIOLE2TextExtractor ext;
|
||||
POITextExtractor[] embeds;
|
||||
File f;
|
||||
|
||||
// No embedings
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(xls);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
assertEquals(0, embeds.length);
|
||||
|
||||
// Excel
|
||||
f = new File(poifs_dir, "excel_with_embeded.xls");
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(f);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
assertEquals(6, embeds.length);
|
||||
assertTrue(embeds[0] instanceof PowerPointExtractor);
|
||||
assertTrue(embeds[1] instanceof ExcelExtractor);
|
||||
assertTrue(embeds[2] instanceof ExcelExtractor);
|
||||
assertTrue(embeds[3] instanceof PowerPointExtractor);
|
||||
assertTrue(embeds[4] instanceof WordExtractor);
|
||||
assertTrue(embeds[5] instanceof WordExtractor);
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
}
|
||||
|
||||
// Word
|
||||
f = new File(poifs_dir, "word_with_embeded.doc");
|
||||
ext = (POIOLE2TextExtractor)
|
||||
ExtractorFactory.createExtractor(f);
|
||||
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
|
||||
|
||||
assertEquals(4, embeds.length);
|
||||
assertTrue(embeds[0] instanceof WordExtractor);
|
||||
assertTrue(embeds[1] instanceof ExcelExtractor);
|
||||
assertTrue(embeds[2] instanceof ExcelExtractor);
|
||||
assertTrue(embeds[3] instanceof PowerPointExtractor);
|
||||
for(int i=0; i<embeds.length; i++) {
|
||||
assertTrue(embeds[i].getText().length() > 20);
|
||||
}
|
||||
|
||||
// TODO - PowerPoint
|
||||
// TODO - Visio
|
||||
}
|
||||
}
|
||||
|
|
|
@ -64,6 +64,10 @@ public class ChunkFactory {
|
|||
private void processChunkParseCommands() throws IOException {
|
||||
String line;
|
||||
InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName);
|
||||
if(cpd == null) {
|
||||
throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName);
|
||||
}
|
||||
|
||||
BufferedReader inp = new BufferedReader(new InputStreamReader(cpd));
|
||||
while( (line = inp.readLine()) != null ) {
|
||||
if(line.startsWith("#")) continue;
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.poi.hdgf.chunks.Chunk.Command;
|
|||
import org.apache.poi.hdgf.streams.ChunkStream;
|
||||
import org.apache.poi.hdgf.streams.PointerContainingStream;
|
||||
import org.apache.poi.hdgf.streams.Stream;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
|
@ -44,7 +45,10 @@ public class VisioTextExtractor extends POIOLE2TextExtractor {
|
|||
this.hdgf = hdgf;
|
||||
}
|
||||
/**
 * Creates an extractor for the diagram found at the root of
 * the given POIFS FileSystem.
 *
 * @param fs the filesystem to read the diagram from
 * @throws IOException if the delegated constructor fails to read the diagram
 */
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
	// Delegate to the directory-aware constructor. Only one explicit
	// constructor invocation is legal, so the superseded direct
	// HDGFDiagram(fs) call is not kept alongside this one.
	this(fs.getRoot(), fs);
}

/**
 * Creates an extractor for a diagram held in the given directory
 * of a POIFS FileSystem, which need not be the root.
 *
 * @param dir the directory holding the diagram's streams
 * @param fs the filesystem that <code>dir</code> belongs to
 * @throws IOException if the diagram cannot be read
 */
public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
	this(new HDGFDiagram(dir, fs));
	// Keep hold of the filesystem the diagram was loaded from
	this.fs = fs;
}
|
||||
public VisioTextExtractor(InputStream inp) throws IOException {
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.poi.hslf.model.Notes;
|
|||
import org.apache.poi.hslf.model.Slide;
|
||||
import org.apache.poi.hslf.model.TextRun;
|
||||
import org.apache.poi.hslf.usermodel.SlideShow;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
|
@ -96,6 +97,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(new HSLFSlideShow(fs));
|
||||
}
|
||||
/**
 * Creates a PowerPointExtractor for a slideshow held in the
 * given directory of a POIFS FileSystem, which need not be
 * the root.
 *
 * @param dir the directory holding the slideshow's streams
 * @param fs the filesystem that <code>dir</code> belongs to
 * @throws IOException if the slideshow cannot be read
 */
public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs)
		throws IOException {
	this(new HSLFSlideShow(dir, fs));
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from a HSLFSlideShow
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.poi.hwpf.model.TextPiece;
|
|||
import org.apache.poi.hwpf.usermodel.HeaderStories;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
|
@ -58,6 +59,10 @@ public class WordExtractor extends POIOLE2TextExtractor {
|
|||
this(new HWPFDocument(fs));
|
||||
this.fs = fs;
|
||||
}
|
||||
/**
 * Create a new Word Extractor for a document held in the
 * given directory of a POIFS FileSystem, which need not be
 * the root.
 *
 * @param dir the directory holding the document's streams
 * @param fs the filesystem that <code>dir</code> belongs to
 * @throws IOException if the document cannot be read
 */
public WordExtractor(DirectoryNode dir, POIFSFileSystem fs)
		throws IOException {
	this(new HWPFDocument(dir, fs));
	// Keep hold of the filesystem the document was loaded from
	this.fs = fs;
}
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
|
|
Binary file not shown.
Loading…
Reference in New Issue