Initial ExtractorFactory support for building TextExtractors for embedded documents

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@691351 13f79535-47bb-0310-9956-ffa450edef68
2025-02-22 18:17:41 +00:00 · 2008-09-02 19:37:52 +00:00 · 2008-09-02 19:37:52 +00:00 · e4ff06ec79
commit e4ff06ec79
parent b83a13bb2a
11 changed files with 175 additions and 7 deletions
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@ -41,6 +41,7 @@
        </release>
 -->
        <release version="3.5.1-beta2" date="2008-08-20">
+           <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
           <action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
           <action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
           <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -38,6 +38,7 @@
        </release>
 -->
        <release version="3.5.1-beta2" date="2008-08-20">
+           <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
           <action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
           <action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
           <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
--- a/src/java/org/apache/poi/POIOLE2TextExtractor.java
+++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java
@ -19,6 +19,7 @@ package org.apache.poi;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
 * Common Parent for OLE2 based Text Extractors
@ -59,4 +60,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
 	public POITextExtractor getMetadataTextExtractor() {
 		return new HPSFPropertiesExtractor(this);
 	}
+
+	/**
+	 * Return the underlying POIFS FileSystem of
+	 *  this document.
+	 */
+	public POIFSFileSystem getFileSystem() {
+		return document.filesystem;
+	}
 }
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@ -26,6 +26,7 @@ import org.apache.poi.hssf.usermodel.HSSFRichTextString;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
@ -48,7 +49,10 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 		this.wb = wb;
 	}
 	public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-		this(new HSSFWorkbook(fs));
+		this(fs.getRoot(), fs);
+	}
+	public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+		this(new HSSFWorkbook(dir, fs, true));
 	}
 	

--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@ -18,9 +18,11 @@ package org.apache.poi.extractor;

 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
+import java.util.ArrayList;
 import java.util.Iterator;

 import org.apache.poi.POIOLE2TextExtractor;
@ -31,6 +33,8 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xslf.XSLFSlideShow;
@ -105,24 +109,95 @@ public class ExtractorFactory {
 	}
 	
 	public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
+		return createExtractor(fs.getRoot(), fs);
+	}
+	public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
 		// Look for certain entries in the stream, to figure it
 		//  out from
-		for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
+		for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) {
 			Entry entry = (Entry)entries.next();
 			
 			if(entry.getName().equals("Workbook")) {
-				return new ExcelExtractor(fs);
+				return new ExcelExtractor(poifsDir, fs);
 			}
 			if(entry.getName().equals("WordDocument")) {
-				return new WordExtractor(fs);
+				return new WordExtractor(poifsDir, fs);
 			}
 			if(entry.getName().equals("PowerPoint Document")) {
-				return new PowerPointExtractor(fs);
+				return new PowerPointExtractor(poifsDir, fs);
 			}
 			if(entry.getName().equals("VisioDocument")) {
-				return new VisioTextExtractor(fs);
+				return new VisioTextExtractor(poifsDir, fs);
 			}
 		}
 		throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
 	}
+	
+	
+	/**
+	 * Returns an array of text extractors, one for each of
+	 *  the embedded documents in the file (if there are any).
+	 * If there are no embedded documents, you'll get back an
+	 *  empty array. Otherwise, you'll get one open
+	 *  {@link POITextExtractor} for each embedded file.
+	 */
+	public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
+		// Find all the embedded directories
+		ArrayList dirs = new ArrayList();
+		POIFSFileSystem fs = ext.getFileSystem();
+		if(fs == null) {
+			throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
+		}
+		
+		if(ext instanceof ExcelExtractor) {
+			// These are in MBD... under the root
+			Iterator it = fs.getRoot().getEntries();
+			while(it.hasNext()) {
+				Entry entry = (Entry)it.next();
+				if(entry.getName().startsWith("MBD")) {
+					dirs.add(entry);
+				}
+			}
+		} else if(ext instanceof WordExtractor) {
+			// These are in ObjectPool -> _... under the root
+			try {
+				DirectoryEntry op = (DirectoryEntry)
+					fs.getRoot().getEntry("ObjectPool");
+				Iterator it = op.getEntries();
+				while(it.hasNext()) {
+					Entry entry = (Entry)it.next();
+					if(entry.getName().startsWith("_")) {
+						dirs.add(entry);
+					}
+				}
+			} catch(FileNotFoundException e) {}
+		} else if(ext instanceof PowerPointExtractor) {
+			// Tricky, not stored directly in poifs
+			// TODO
+		}
+		
+		// Create the extractors
+		if(dirs == null || dirs.size() == 0) {
+			return new POITextExtractor[0];
+		}
+		
+		POITextExtractor[] te = new POITextExtractor[dirs.size()];
+		for(int i=0; i<te.length; i++) {
+			te[i] = createExtractor(
+					(DirectoryNode)dirs.get(i), ext.getFileSystem()
+			);
+		}
+		return te;
+	}
+
+	/**
+	 * Returns an array of text extractors, one for each of
+	 *  the embedded documents in the file (if there are any).
+	 * If there are no embedded documents, you'll get back an
+	 *  empty array. Otherwise, you'll get one open
+	 *  {@link POITextExtractor} for each embedded file.
+	 */
+	public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
+		throw new IllegalStateException("Not yet supported");
+	}
 }
--- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
+++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
@ -20,6 +20,8 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;

+import org.apache.poi.POIOLE2TextExtractor;
+import org.apache.poi.POITextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
@ -42,6 +44,7 @@ public class TestExtractorFactory extends TestCase {
 	private String word_dir;
 	private String powerpoint_dir;
 	private String visio_dir;
+	private String poifs_dir;
 	
 	private File txt;
 	
@ -63,6 +66,12 @@ public class TestExtractorFactory extends TestCase {
 		word_dir = System.getProperty("HWPF.testdata.path");
 		powerpoint_dir = System.getProperty("HSLF.testdata.path");
 		visio_dir = System.getProperty("HDGF.testdata.path");
+		poifs_dir = System.getProperty("POIFS.testdata.path");
+		assertNotNull(excel_dir);
+		assertNotNull(word_dir);
+		assertNotNull(powerpoint_dir);
+		assertNotNull(visio_dir);
+		assertNotNull(poifs_dir);
 		
 		txt = new File(powerpoint_dir, "SampleShow.txt");
 		
@ -300,4 +309,56 @@ public class TestExtractorFactory extends TestCase {
 			// Good
 		}
 	}
+
+	/**
+	 * Test embedded docs text extraction. For now, this only
+	 *  covers POIFS embedded docs, but will cover OOXML ones
+	 *  at some point.
+	 */
+	public void testEmbeded() throws Exception {
+		POIOLE2TextExtractor ext;
+		POITextExtractor[] embeds;
+		File f;
+		
+		// No embeddings
+		ext = (POIOLE2TextExtractor)
+				ExtractorFactory.createExtractor(xls);
+		embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+		assertEquals(0, embeds.length);
+		
+		// Excel
+		f = new File(poifs_dir, "excel_with_embeded.xls");
+		ext = (POIOLE2TextExtractor)
+				ExtractorFactory.createExtractor(f);
+		embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+		
+		assertEquals(6, embeds.length);
+		assertTrue(embeds[0] instanceof PowerPointExtractor);
+		assertTrue(embeds[1] instanceof ExcelExtractor);
+		assertTrue(embeds[2] instanceof ExcelExtractor);
+		assertTrue(embeds[3] instanceof PowerPointExtractor);
+		assertTrue(embeds[4] instanceof WordExtractor);
+		assertTrue(embeds[5] instanceof WordExtractor);
+		for(int i=0; i<embeds.length; i++) {
+			assertTrue(embeds[i].getText().length() > 20);
+		}
+		
+		// Word
+		f = new File(poifs_dir, "word_with_embeded.doc");
+		ext = (POIOLE2TextExtractor)
+				ExtractorFactory.createExtractor(f);
+		embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+		
+		assertEquals(4, embeds.length);
+		assertTrue(embeds[0] instanceof WordExtractor);
+		assertTrue(embeds[1] instanceof ExcelExtractor);
+		assertTrue(embeds[2] instanceof ExcelExtractor);
+		assertTrue(embeds[3] instanceof PowerPointExtractor);
+		for(int i=0; i<embeds.length; i++) {
+			assertTrue(embeds[i].getText().length() > 20);
+		}
+		
+		// TODO - PowerPoint
+		// TODO - Visio
+	}
 }
--- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
@ -64,6 +64,10 @@ public class ChunkFactory {
 	private void processChunkParseCommands() throws IOException {
 		String line;
 		InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName);
+		if(cpd == null) {
+			throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName);
+		}
+		
 		BufferedReader inp = new BufferedReader(new InputStreamReader(cpd));
 		while( (line = inp.readLine()) != null ) {
 			if(line.startsWith("#")) continue;
--- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
@ -28,6 +28,7 @@ import org.apache.poi.hdgf.chunks.Chunk.Command;
 import org.apache.poi.hdgf.streams.ChunkStream;
 import org.apache.poi.hdgf.streams.PointerContainingStream;
 import org.apache.poi.hdgf.streams.Stream;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
@ -44,7 +45,10 @@ public class VisioTextExtractor extends POIOLE2TextExtractor {
 		this.hdgf = hdgf;
 	}
 	public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
-		this(new HDGFDiagram(fs));
+		this(fs.getRoot(), fs);
+	}
+	public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+		this(new HDGFDiagram(dir, fs));
 		this.fs = fs;
 	}
 	public VisioTextExtractor(InputStream inp) throws IOException {
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@ -30,6 +30,7 @@ import org.apache.poi.hslf.model.Notes;
 import org.apache.poi.hslf.model.Slide;
 import org.apache.poi.hslf.model.TextRun;
 import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
@ -96,6 +97,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
 	public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
 		this(new HSLFSlideShow(fs));
 	}
+	public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+		this(new HSLFSlideShow(dir, fs));
+	}

 	/**
 	 * Creates a PowerPointExtractor, from a HSLFSlideShow
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@ -28,6 +28,7 @@ import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.HeaderStories;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
@ -58,6 +59,10 @@ public class WordExtractor extends POIOLE2TextExtractor {
 		this(new HWPFDocument(fs));
 		this.fs = fs;
 	}
+	public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+		this(new HWPFDocument(dir, fs));
+		this.fs = fs;
+	}
 	
 	/**
 	 * Create a new Word Extractor
--- a/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx
+++ b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx