Initial ExtractorFactory support for building TextExtractors for embedded documents

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@691351 13f79535-47bb-0310-9956-ffa450edef68
2008-09-02 19:37:52 +00:00 · 2008-09-02 19:37:52 +00:00 · e4ff06ec79
parent b83a13bb2a
commit e4ff06ec79
11 changed files with 175 additions and 7 deletions
--- a/src/documentation/content/xdocs/changes.xml
+++ b/src/documentation/content/xdocs/changes.xml
@ -41,6 +41,7 @@
        </release>
 -->
        <release version="3.5.1-beta2" date="2008-08-20">
           <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
           <action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
           <action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
           <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -38,6 +38,7 @@
        </release>
 -->
        <release version="3.5.1-beta2" date="2008-08-20">
           <action dev="POI-DEVELOPERS" type="add">Initial ExtractorFactory support for building TextExtractors for embedded documents</action>
           <action dev="POI-DEVELOPERS" type="add">Support stripping XSSF header and footer fields (eg page number) out of header and footer text if required</action>
           <action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
           <action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
--- a/src/java/org/apache/poi/POIOLE2TextExtractor.java
+++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java
@ -19,6 +19,7 @@ package org.apache.poi;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 /**
 * Common Parent for OLE2 based Text Extractors
@ -59,4 +60,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
 	public POITextExtractor getMetadataTextExtractor() {
 		return new HPSFPropertiesExtractor(this);
 	}
 	/**
 	 * Gives access to the POIFS FileSystem that is
 	 *  held by the underlying document.
 	 */
 	public POIFSFileSystem getFileSystem() {
 		POIFSFileSystem backingFs = document.filesystem;
 		return backingFs;
 	}
 }
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@ -26,6 +26,7 @@ import org.apache.poi.hssf.usermodel.HSSFRichTextString;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 /**
@ -48,7 +49,10 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 		this.wb = wb;
 	}
 	/**
 	 * Creates an extractor from the given filesystem, by
 	 *  handing the filesystem's root directory on to the
 	 *  directory based constructor.
 	 */
 	public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-		this(new HSSFWorkbook(fs));
+		this(fs.getRoot(), fs);
 	}
 	/**
 	 * Creates an extractor from a HSSFWorkbook that is built
 	 *  from the given directory and filesystem.
 	 */
 	public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
 		// NOTE(review): the meaning of the trailing "true" flag isn't
 		//  visible from here - confirm against the HSSFWorkbook constructor
 		this(new HSSFWorkbook(dir, fs, true));
 	}
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@ -18,9 +18,11 @@ package org.apache.poi.extractor;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
 import java.util.ArrayList;
 import java.util.Iterator;
 import org.apache.poi.POIOLE2TextExtractor;
@ -31,6 +33,8 @@ import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.xslf.XSLFSlideShow;
@ -105,24 +109,95 @@ public class ExtractorFactory {
 	}
 	/**
 	 * Builds the extractor for the document that sits at
 	 *  the root of the given filesystem.
 	 */
 	public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
 		DirectoryNode root = fs.getRoot();
 		return createExtractor(root, fs);
 	}
 	/**
 	 * Builds the extractor matching the document held in the
 	 *  given POIFS directory. The kind of document is picked
 	 *  from the names of the entries directly under that
 	 *  directory, and the first recognised name wins.
 	 * @throws IllegalArgumentException if no entry has one of the recognised names
 	 */
 	public static POIOLE2TextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException {
 		// Look for certain entries in the stream, to figure it
 		//  out from
-		for(Iterator entries = fs.getRoot().getEntries(); entries.hasNext(); ) {
+		for(Iterator entries = poifsDir.getEntries(); entries.hasNext(); ) {
 			Entry entry = (Entry)entries.next();
 			if(entry.getName().equals("Workbook")) {
-				return new ExcelExtractor(fs);
+				return new ExcelExtractor(poifsDir, fs);
 			}
 			if(entry.getName().equals("WordDocument")) {
-				return new WordExtractor(fs);
+				return new WordExtractor(poifsDir, fs);
 			}
 			if(entry.getName().equals("PowerPoint Document")) {
-				return new PowerPointExtractor(fs);
+				return new PowerPointExtractor(poifsDir, fs);
 			}
 			if(entry.getName().equals("VisioDocument")) {
-				return new VisioTextExtractor(fs);
+				return new VisioTextExtractor(poifsDir, fs);
 			}
 		}
 		throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
 	}
 	/**
 	 * Returns an array of text extractors, one for each of
 	 *  the embedded documents in the file (if there are any).
 	 * If there are no embedded documents, you'll get back an
 	 *  empty array. Otherwise, you'll get one open
 	 *  {@link POITextExtractor} for each embedded file.
 	 * Only Excel and Word parents are searched for now, and
 	 *  only entries that are directories are treated as
 	 *  embedded documents.
 	 *
 	 * @param ext the extractor of the containing document
 	 * @return the extractors of the embedded documents, never null
 	 * @throws IOException if building one of the extractors fails
 	 * @throws IllegalStateException if the extractor has no POIFS FileSystem
 	 * @throws IllegalArgumentException if an embedded directory holds
 	 *  no supported document
 	 */
 	public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
 		POIFSFileSystem fs = ext.getFileSystem();
 		if(fs == null) {
 			throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
 		}
 		// Find all the embedded directories
 		ArrayList dirs = new ArrayList();
 		if(ext instanceof ExcelExtractor) {
 			// These are in MBD... under the root
 			addMatchingDirectories(fs.getRoot().getEntries(), "MBD", dirs);
 		} else if(ext instanceof WordExtractor) {
 			// These are in ObjectPool -> _... under the root
 			try {
 				Entry op = fs.getRoot().getEntry("ObjectPool");
 				// Guard the cast, a plain stream of that name holds nothing for us
 				if(op instanceof DirectoryEntry) {
 					addMatchingDirectories(((DirectoryEntry)op).getEntries(), "_", dirs);
 				}
 			} catch(FileNotFoundException ignored) {
 				// No ObjectPool, so there are no embedded documents
 			}
 		} else if(ext instanceof PowerPointExtractor) {
 			// Tricky, not stored directly in poifs
 			// TODO
 		}
 		// Create the extractors (an empty list gives an empty array)
 		POITextExtractor[] te = new POITextExtractor[dirs.size()];
 		for(int i=0; i<te.length; i++) {
 			te[i] = createExtractor((DirectoryNode)dirs.get(i), fs);
 		}
 		return te;
 	}
 	/**
 	 * Adds to dirs every entry that is a {@link DirectoryNode}
 	 *  and whose name starts with the given prefix. Entries
 	 *  that aren't directories are skipped, so that the later
 	 *  cast to DirectoryNode is always safe.
 	 */
 	private static void addMatchingDirectories(Iterator entries, String prefix, ArrayList dirs) {
 		while(entries.hasNext()) {
 			Entry entry = (Entry)entries.next();
 			if(entry instanceof DirectoryNode && entry.getName().startsWith(prefix)) {
 				dirs.add(entry);
 			}
 		}
 	}
 	/**
 	 * Placeholder for fetching the text extractors of the
 	 *  documents embedded in the file behind a
 	 *  {@link POIXMLTextExtractor}. This isn't implemented
 	 *  yet, so no array is ever returned.
 	 * @throws IllegalStateException always, as this is not yet supported
 	 */
 	public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
 		throw new IllegalStateException("Not yet supported");
 	}
 }
--- a/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
+++ b/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
@ -20,6 +20,8 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
@ -42,6 +44,7 @@ public class TestExtractorFactory extends TestCase {
 	private String word_dir;
 	private String powerpoint_dir;
 	private String visio_dir;
 	private String poifs_dir;
 	private File txt;
@ -63,6 +66,12 @@ public class TestExtractorFactory extends TestCase {
 		word_dir = System.getProperty("HWPF.testdata.path");
 		powerpoint_dir = System.getProperty("HSLF.testdata.path");
 		visio_dir = System.getProperty("HDGF.testdata.path");
 		poifs_dir = System.getProperty("POIFS.testdata.path");
 		assertNotNull(excel_dir);
 		assertNotNull(word_dir);
 		assertNotNull(powerpoint_dir);
 		assertNotNull(visio_dir);
 		assertNotNull(poifs_dir);
 		txt = new File(powerpoint_dir, "SampleShow.txt");
@ -300,4 +309,56 @@ public class TestExtractorFactory extends TestCase {
 			// Good
 		}
 	}
 	/**
 	 * Checks the extraction of text from embedded documents.
 	 *  Currently this only covers documents embedded in
 	 *  poifs files, ooxml ones are to follow at some point.
 	 */
 	public void testEmbeded() throws Exception {
 		// A file with nothing embedded in it
 		POIOLE2TextExtractor parent = (POIOLE2TextExtractor)
 				ExtractorFactory.createExtractor(xls);
 		POITextExtractor[] children = ExtractorFactory.getEmbededDocsTextExtractors(parent);
 		assertEquals(0, children.length);
 		// An Excel file holding six other documents
 		File source = new File(poifs_dir, "excel_with_embeded.xls");
 		parent = (POIOLE2TextExtractor)
 				ExtractorFactory.createExtractor(source);
 		children = ExtractorFactory.getEmbededDocsTextExtractors(parent);
 		assertEquals(6, children.length);
 		assertTrue(children[0] instanceof PowerPointExtractor);
 		assertTrue(children[1] instanceof ExcelExtractor);
 		assertTrue(children[2] instanceof ExcelExtractor);
 		assertTrue(children[3] instanceof PowerPointExtractor);
 		assertTrue(children[4] instanceof WordExtractor);
 		assertTrue(children[5] instanceof WordExtractor);
 		// Each embedded document should give some real text
 		for(int idx=0; idx<children.length; idx++) {
 			String text = children[idx].getText();
 			assertTrue(text.length() > 20);
 		}
 		// A Word file holding four other documents
 		source = new File(poifs_dir, "word_with_embeded.doc");
 		parent = (POIOLE2TextExtractor)
 				ExtractorFactory.createExtractor(source);
 		children = ExtractorFactory.getEmbededDocsTextExtractors(parent);
 		assertEquals(4, children.length);
 		assertTrue(children[0] instanceof WordExtractor);
 		assertTrue(children[1] instanceof ExcelExtractor);
 		assertTrue(children[2] instanceof ExcelExtractor);
 		assertTrue(children[3] instanceof PowerPointExtractor);
 		for(int idx=0; idx<children.length; idx++) {
 			String text = children[idx].getText();
 			assertTrue(text.length() > 20);
 		}
 		// TODO - PowerPoint
 		// TODO - Visio
 	}
 }
--- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkFactory.java
@ -64,6 +64,10 @@ public class ChunkFactory {
 	private void processChunkParseCommands() throws IOException {
 		String line;
 		InputStream cpd = ChunkFactory.class.getResourceAsStream(chunkTableName);
 		if(cpd == null) {
 			throw new IllegalStateException("Unable to find HDGF chunk definition on the classpath - " + chunkTableName);
 		}
 		BufferedReader inp = new BufferedReader(new InputStreamReader(cpd));
 		while( (line = inp.readLine()) != null ) {
 			if(line.startsWith("#")) continue;
--- a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
@ -28,6 +28,7 @@ import org.apache.poi.hdgf.chunks.Chunk.Command;
 import org.apache.poi.hdgf.streams.ChunkStream;
 import org.apache.poi.hdgf.streams.PointerContainingStream;
 import org.apache.poi.hdgf.streams.Stream;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 /**
@ -44,7 +45,10 @@ public class VisioTextExtractor extends POIOLE2TextExtractor {
 		this.hdgf = hdgf;
 	}
 	/**
 	 * Creates an extractor from the given filesystem, by
 	 *  handing the filesystem's root directory on to the
 	 *  directory based constructor.
 	 */
 	public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
-		this(new HDGFDiagram(fs));
+		this(fs.getRoot(), fs);
 	}
 	/**
 	 * Creates an extractor from a HDGFDiagram that is built
 	 *  from the given directory and filesystem, and records
 	 *  the filesystem on this extractor.
 	 */
 	public VisioTextExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
 		this(new HDGFDiagram(dir, fs));
 		this.fs = fs;
 	}
 	public VisioTextExtractor(InputStream inp) throws IOException {
--- a/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hslf/extractor/PowerPointExtractor.java
@ -30,6 +30,7 @@ import org.apache.poi.hslf.model.Notes;
 import org.apache.poi.hslf.model.Slide;
 import org.apache.poi.hslf.model.TextRun;
 import org.apache.poi.hslf.usermodel.SlideShow;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 /**
@ -96,6 +97,9 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
 	public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
 		this(new HSLFSlideShow(fs));
 	}
 	/**
 	 * Creates a PowerPointExtractor, from a HSLFSlideShow that
 	 *  is built from the given directory and filesystem.
 	 */
 	public PowerPointExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
 		this(new HSLFSlideShow(dir, fs));
 	}
 	/**
 	 * Creates a PowerPointExtractor, from a HSLFSlideShow
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@ -28,6 +28,7 @@ import org.apache.poi.hwpf.model.TextPiece;
 import org.apache.poi.hwpf.usermodel.HeaderStories;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 /**
@ -58,6 +59,10 @@ public class WordExtractor extends POIOLE2TextExtractor {
 		this(new HWPFDocument(fs));
 		this.fs = fs;
 	}
 	/**
 	 * Creates a new Word Extractor, from a HWPFDocument that
 	 *  is built from the given directory and filesystem, and
 	 *  records the filesystem on this extractor.
 	 */
 	public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
 		this(new HWPFDocument(dir, fs));
 		this.fs = fs;
 	}
 	/**
 	 * Create a new Word Extractor
--- a/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx
+++ b/src/testcases/org/apache/poi/hssf/data/WithEmbeded.xlsx