Add Word-to-Text converter and use it as replacement for WordExtractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155336 13f79535-47bb-0310-9956-ffa450edef68
2011-08-09 12:38:52 +00:00 · 2011-08-09 12:38:52 +00:00 · 49697de696
parent 888f51c566
commit 49697de696
26 changed files with 1117 additions and 488 deletions
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@ -34,6 +34,7 @@

    <changes>
        <release version="3.8-beta4" date="2011-??-??">
+           <action dev="poi-developers" type="add">Add Word-to-Text converter and use it as replacement for WordExtractor</action>
           <action dev="poi-developers" type="fix">51604 - replace text fails for doc ( poi 3.8 beta release from download site )</action>
           <action dev="poi-developers" type="fix">Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF</action>
           <action dev="poi-developers" type="add">Support for conditional formatting in XSSF</action>
--- a/src/java/org/apache/poi/POIOLE2TextExtractor.java
+++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java
@ -19,6 +19,7 @@ package org.apache.poi;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
@ -39,7 +40,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
 	public POIOLE2TextExtractor(POIDocument document) {
 		super(document);
 	}
-	
+
 	/**
 	 * Returns the document information metadata for the document
 	 */
@ -52,20 +53,28 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
 	public SummaryInformation getSummaryInformation() {
 		return document.getSummaryInformation();
 	}
-	
+
 	/**
-	 * Returns an HPSF powered text extractor for the 
+	 * Returns an HPSF powered text extractor for the
 	 *  document properties metadata, such as title and author.
 	 */
 	public POITextExtractor getMetadataTextExtractor() {
 		return new HPSFPropertiesExtractor(this);
 	}

-	/**
-	 * Return the underlying POIFS FileSystem of
-	 *  this document.
-	 */
-	public POIFSFileSystem getFileSystem() {
-		return document.directory.getFileSystem();
-	}
+    public DirectoryEntry getRoot()
+    {
+        return document.directory;
+    }
+
+    /**
+     * Return the underlying POIFS FileSystem of this document.
+     *
+     * @deprecated Use {@link #getRoot()} instead
+     */
+    @Deprecated
+    public POIFSFileSystem getFileSystem()
+    {
+        return document.directory.getFileSystem();
+    }
 }
--- a/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/EventBasedExcelExtractor.java
@ -61,17 +61,27 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 */
 public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
   private DirectoryNode _dir;
-	private POIFSFileSystem _fs;
 	boolean _includeSheetNames = true;
 	boolean _formulasNotResults = false;

-	public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
-		super(null);
-		_dir = dir;
-		_fs = fs;
-	}
+    /**
+     * @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public EventBasedExcelExtractor( DirectoryNode dir, POIFSFileSystem fs )
+    {
+        this( dir );
+    }
+
+    public EventBasedExcelExtractor( DirectoryNode dir )
+    {
+        super( null );
+        _dir = dir;
+    }
+
   public EventBasedExcelExtractor(POIFSFileSystem fs) {
-      this(fs.getRoot(), fs);
+      this(fs.getRoot());
   }

   /**
@ -79,9 +89,9 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
    *  this document.
    */
   public POIFSFileSystem getFileSystem() {
-      return _fs;
+      return _dir.getFileSystem();
   }
-   
+
 	/**
 	 * Would return the document information metadata for the document,
 	 *  if we supported it
@ -200,7 +210,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
 						outputNextStringValue = true;
 						nextRow = frec.getRow();
 					} else {
-						thisText = _ft.formatNumberDateCell(frec); 
+						thisText = _ft.formatNumberDateCell(frec);
 					}
 				}
 				break;
@ -234,7 +244,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
 			case NumberRecord.sid:
 				NumberRecord numrec = (NumberRecord) record;
 				thisRow = numrec.getRow();
-				thisText = _ft.formatNumberDateCell(numrec); 
+				thisText = _ft.formatNumberDateCell(numrec);
 				break;
 			default:
 				break;
--- a/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
+++ b/src/java/org/apache/poi/hssf/extractor/ExcelExtractor.java
@ -24,7 +24,6 @@ import java.io.InputStream;
 import java.io.PrintStream;

 import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.ss.formula.eval.ErrorEval;
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFCellStyle;
 import org.apache.poi.hssf.usermodel.HSSFComment;
@ -35,12 +34,13 @@ import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.formula.eval.ErrorEval;
 import org.apache.poi.ss.usermodel.HeaderFooter;

 /**
 * A text extractor for Excel files.
 * <p>
- * Returns the textual content of the file, suitable for 
+ * Returns the textual content of the file, suitable for
 *  indexing by something like Lucene, but not really
 *  intended for display to the user.
 * </p>
@ -59,19 +59,27 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 	private boolean _includeCellComments = false;
 	private boolean _includeBlankCells = false;
 	private boolean _includeHeadersFooters = true;
-	
+
 	public ExcelExtractor(HSSFWorkbook wb) {
 		super(wb);
 		_wb = wb;
 		_formatter = new HSSFDataFormatter();
 	}
 	public ExcelExtractor(POIFSFileSystem fs) throws IOException {
-		this(fs.getRoot(), fs);
+		this(fs.getRoot());
 	}
-	public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-		this(new HSSFWorkbook(dir, fs, true));
+	/**
+     * @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+        this( dir );
+    }
+    public ExcelExtractor(DirectoryNode dir) throws IOException {
+		this(new HSSFWorkbook(dir, true));
 	}
-	
+
 	private static final class CommandParseException extends Exception {
 		public CommandParseException(String msg) {
 			super(msg);
@ -183,7 +191,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 			return _headersFooters;
 		}
 	}
-	
+
 	private static void printUsageMessage(PrintStream ps) {
 		ps.println("Use:");
 		ps.println("    " + ExcelExtractor.class.getName() + " [<flag> <value> [<flag> <value> [...]]] [-i <filename.xls>]");
@ -201,7 +209,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 	 * Command line extractor.
 	 */
 	public static void main(String[] args) {
-		
+
 		CommandArgs cmdArgs;
 		try {
 			cmdArgs = new CommandArgs(args);
@ -211,12 +219,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 			System.exit(1);
 			return; // suppress compiler error
 		}
-		
+
 		if (cmdArgs.isRequestHelp()) {
 			printUsageMessage(System.out);
 			return;
 		}
-		
+
 		try {
 			InputStream is;
 			if(cmdArgs.getInputFile() == null) {
@ -270,9 +278,9 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 	 * Default is to include them.
 	 */
 	public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
-		_includeHeadersFooters = includeHeadersFooters; 
+		_includeHeadersFooters = includeHeadersFooters;
 	}
-	
+
 	/**
 	 * Retrieves the text contents of the file
 	 */
@ -282,12 +290,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 		// We don't care about the difference between
 		//  null (missing) and blank cells
 		_wb.setMissingCellPolicy(HSSFRow.RETURN_BLANK_AS_NULL);
-		
+
 		// Process each sheet in turn
 		for(int i=0;i<_wb.getNumberOfSheets();i++) {
 			HSSFSheet sheet = _wb.getSheetAt(i);
 			if(sheet == null) { continue; }
-			
+
 			if(_includeSheetNames) {
 				String name = _wb.getSheetName(i);
 				if(name != null) {
@ -295,12 +303,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 					text.append("\n");
 				}
 			}
-			
+
 			// Header text, if there is any
 			if(_includeHeadersFooters) {
 				text.append(_extractHeaderFooter(sheet.getHeader()));
 			}
-			
+
 			int firstRow = sheet.getFirstRowNum();
 			int lastRow = sheet.getLastRowNum();
 			for(int j=firstRow;j<=lastRow;j++) {
@ -313,7 +321,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 				if(_includeBlankCells) {
 					firstCell = 0;
 				}
-				
+
 				for(int k=firstCell;k<lastCell;k++) {
 					HSSFCell cell = row.getCell(k);
 					boolean outputContents = true;
@ -368,14 +376,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 										case HSSFCell.CELL_TYPE_ERROR:
 											text.append(ErrorEval.getText(cell.getErrorCellValue()));
 											break;
-											
+
 									}
 								}
 								break;
 							default:
 								throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")");
 						}
-						
+
 						// Output the comment, if requested and exists
 						HSSFComment comment = cell.getCellComment();
 						if(_includeCellComments && comment != null) {
@ -385,29 +393,29 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 							text.append(" Comment by "+comment.getAuthor()+": "+commentText);
 						}
 					}
-					
+
 					// Output a tab if we're not on the last cell
 					if(outputContents && k < (lastCell-1)) {
 						text.append("\t");
 					}
 				}
-				
+
 				// Finish off the row
 				text.append("\n");
 			}
-			
+
 			// Finally Footer text, if there is any
 			if(_includeHeadersFooters) {
 				text.append(_extractHeaderFooter(sheet.getFooter()));
 			}
 		}
-		
+
 		return text.toString();
 	}
-	
+
 	public static String _extractHeaderFooter(HeaderFooter hf) {
 		StringBuffer text = new StringBuffer();
-		
+
 		if(hf.getLeft() != null) {
 			text.append(hf.getLeft());
 		}
@ -423,7 +431,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
 		}
 		if(text.length() > 0)
 			text.append("\n");
-		
+
 		return text.toString();
 	}
 }
--- a/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
+++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryEntry.java
@ -15,13 +15,14 @@
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
-        
+

 package org.apache.poi.poifs.filesystem;

-import java.io.*;
-
-import java.util.*;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;

 import org.apache.poi.hpsf.ClassID;

@ -67,6 +68,12 @@ public interface DirectoryEntry

    public int getEntryCount();

+    /**
+     * Checks if entry with specified name present
+     */
+
+    public boolean hasEntry( final String name );
+
    /**
     * get a specified Entry by name
     *
--- a/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
+++ b/src/java/org/apache/poi/poifs/filesystem/DirectoryNode.java
@ -15,7 +15,7 @@
   See the License for the specific language governing permissions and
   limitations under the License.
 ==================================================================== */
-        
+

 package org.apache.poi.poifs.filesystem;

@ -53,7 +53,7 @@ public class DirectoryNode
    // the POIFSFileSystem we belong to
    private POIFSFileSystem   _ofilesystem;
    // the NPOIFSFileSytem we belong to
-    private NPOIFSFileSystem  _nfilesystem; 
+    private NPOIFSFileSystem  _nfilesystem;

    // the path described by this document
    private POIFSDocumentPath _path;
@ -72,7 +72,7 @@ public class DirectoryNode
    {
       this(property, parent, filesystem, (NPOIFSFileSystem)null);
    }
-    
+
    /**
     * create a DirectoryNode. This method is not public by design; it
     * is intended strictly for the internal use of this package
@ -87,7 +87,7 @@ public class DirectoryNode
    {
       this(property, parent, (POIFSFileSystem)null, nfilesystem);
    }
-    
+
    private DirectoryNode(final DirectoryProperty property,
                          final DirectoryNode parent,
                          final POIFSFileSystem ofilesystem,
@ -96,7 +96,7 @@ public class DirectoryNode
        super(property, parent);
        this._ofilesystem = ofilesystem;
        this._nfilesystem = nfilesystem;
-        
+
        if (parent == null)
        {
            _path = new POIFSDocumentPath();
@ -143,23 +143,23 @@ public class DirectoryNode
    {
        return _path;
    }
-    
+
    /**
     * @return the filesystem that this belongs to
     */
    public POIFSFileSystem getFileSystem()
    {
-        return _ofilesystem; 
+        return _ofilesystem;
    }
-    
+
    /**
     * @return the filesystem that this belongs to
     */
    public NPOIFSFileSystem getNFileSystem()
    {
-        return _nfilesystem; 
+        return _nfilesystem;
    }
-    
+
    /**
     * open a document in the directory's entry's list of entries
     *
@ -195,7 +195,7 @@ public class DirectoryNode
            throw new IOException("Entry '" + document.getName()
                                  + "' is not a DocumentEntry");
        }
-        
+
        DocumentEntry entry = (DocumentEntry)document;
        return new DocumentInputStream(entry);
    }
@ -217,7 +217,7 @@ public class DirectoryNode

        (( DirectoryProperty ) getProperty()).addChild(property);
        _ofilesystem.addDocument(document);
-        
+
        _entries.add(rval);
        _byname.put(property.getName(), rval);
        return rval;
@ -240,7 +240,7 @@ public class DirectoryNode

        (( DirectoryProperty ) getProperty()).addChild(property);
        _nfilesystem.addDocument(document);
-        
+
        _entries.add(rval);
        _byname.put(property.getName(), rval);
        return rval;
@ -290,7 +290,7 @@ public class DirectoryNode
        {
            _entries.remove(entry);
        	   _byname.remove(entry.getName());
-        	   
+
        	   if(_ofilesystem != null) {
               _ofilesystem.remove(entry);
        	   } else {
@ -342,6 +342,11 @@ public class DirectoryNode
        return _entries.size();
    }

+    public boolean hasEntry( String name )
+    {
+        return name != null && _byname.containsKey( name );
+    }
+
    /**
     * get a specified Entry by name
     *
@ -430,7 +435,7 @@ public class DirectoryNode
    {
        DirectoryNode rval;
        DirectoryProperty property = new DirectoryProperty(name);
-        
+
        if(_ofilesystem != null) {
           rval = new DirectoryNode(property, _ofilesystem, this);
           _ofilesystem.addDirectory(property);
@ -562,7 +567,7 @@ public class DirectoryNode
     * Returns an Iterator over all the entries
     */
    public Iterator<Entry> iterator() {
-        return getEntries(); 
+        return getEntries();
    }

    /* **********  END  begin implementation of POIFSViewable ********** */
--- a/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
+++ b/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
@ -66,48 +66,48 @@ import org.apache.xmlbeans.XmlException;
 public class ExtractorFactory {
 	public static final String CORE_DOCUMENT_REL =
 		"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
-	
-	
+
+
 	/** Should this thread prefer event based over usermodel based extractors? */
 	private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
      protected Boolean initialValue() { return Boolean.FALSE; }
 	};
 	/** Should all threads prefer event based over usermodel based extractors? */
 	private static Boolean allPreferEventExtractors;
-	
-   /** 
+
+   /**
    * Should this thread prefer event based over usermodel based extractors?
-    * (usermodel extractors tend to be more accurate, but use more memory) 
-    * Default is false. 
+    * (usermodel extractors tend to be more accurate, but use more memory)
+    * Default is false.
    */
 	public static boolean getThreadPrefersEventExtractors() {
 	   return threadPreferEventExtractors.get();
 	}
-   /** 
-    * Should all threads prefer event based over usermodel based extractors? 
-    * (usermodel extractors tend to be more accurate, but use more memory) 
-    * Default is to use the thread level setting, which defaults to false. 
+   /**
+    * Should all threads prefer event based over usermodel based extractors?
+    * (usermodel extractors tend to be more accurate, but use more memory)
+    * Default is to use the thread level setting, which defaults to false.
    */
 	public static Boolean getAllThreadsPreferEventExtractors() {
 	   return allPreferEventExtractors;
 	}
-	
-   /** 
+
+   /**
    * Should this thread prefer event based over usermodel based extractors?
-    * Will only be used if the All Threads setting is null. 
+    * Will only be used if the All Threads setting is null.
    */
   public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
      threadPreferEventExtractors.set(preferEventExtractors);
   }
-   /** 
+   /**
    * Should all threads prefer event based over usermodel based extractors?
-    * If set, will take preference over the Thread level setting. 
+    * If set, will take preference over the Thread level setting.
    */
   public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
      allPreferEventExtractors = preferEventExtractors;
   }
-	
-   
+
+
   /**
    * Should this thread use event based extractors is available?
    * Checks the all-threads one first, then thread specific.
@ -118,8 +118,8 @@ public class ExtractorFactory {
      }
      return threadPreferEventExtractors.get();
   }
-   
-	
+
+
 	public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 		InputStream inp = null;
        try {
@ -137,14 +137,14 @@ public class ExtractorFactory {
            if(inp != null) inp.close();
        }
    }
-	
+
 	public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 		// Figure out the kind of stream
 		// If clearly doesn't do mark/reset, wrap up
 		if(! inp.markSupported()) {
 			inp = new PushbackInputStream(inp, 8);
 		}
-		
+
 		if(POIFSFileSystem.hasPOIFSHeader(inp)) {
 			return createExtractor(new POIFSFileSystem(inp));
 		}
@ -153,16 +153,16 @@ public class ExtractorFactory {
 		}
 		throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
 	}
-	
+
 	public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
-       PackageRelationshipCollection core = 
+       PackageRelationshipCollection core =
            pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
       if(core.size() != 1) {
          throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
       }

       PackagePart corePart = pkg.getPart(core.getRelationship(0));
-        
+
       // Is it XSSF?
       for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
          if(corePart.getContentType().equals(rel.getContentType())) {
@ -173,84 +173,98 @@ public class ExtractorFactory {
             }
          }
       }
-        
+
       // Is it XWPF?
       for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
          if(corePart.getContentType().equals(rel.getContentType())) {
             return new XWPFWordExtractor(pkg);
          }
       }
-       
+
       // Is it XSLF?
       for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
          if(corePart.getContentType().equals(rel.getContentType())) {
             return new XSLFPowerPointExtractor(pkg);
          }
       }
-       
+
       throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
 	}
-	
+
 	public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
 	   // Only ever an OLE2 one from the root of the FS
-		return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
+		return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
 	}
-	public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
-		// Look for certain entries in the stream, to figure it
-		//  out from
-		for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
-			Entry entry = entries.next();
-			
-			if(entry.getName().equals("Workbook")) {
-			   if(getPreferEventExtractor()) {
-               return new EventBasedExcelExtractor(poifsDir, fs);
-			   } else {
-			      return new ExcelExtractor(poifsDir, fs);
-			   }
-			}
-			if(entry.getName().equals("WordDocument")) {
-			    // Old or new style word document?
-			    try {
-			        return new WordExtractor(poifsDir, fs);
-			    } catch(OldWordFileFormatException e) {
-			        return new Word6Extractor(poifsDir, fs);
-			    }
-			}
-			if(entry.getName().equals("PowerPoint Document")) {
-				return new PowerPointExtractor(poifsDir, fs);
-			}
-			if(entry.getName().equals("VisioDocument")) {
-				return new VisioTextExtractor(poifsDir, fs);
-			}
-         if(entry.getName().equals("Quill")) {
-            return new PublisherTextExtractor(poifsDir, fs);
-         }
-			if(
-                entry.getName().equals("__substg1.0_1000001E") ||
-                entry.getName().equals("__substg1.0_1000001F") ||
-                entry.getName().equals("__substg1.0_0047001E") ||
-                entry.getName().equals("__substg1.0_0047001F") ||
-                entry.getName().equals("__substg1.0_0037001E") ||
-                entry.getName().equals("__substg1.0_0037001F")
-			) {
-			   return new OutlookTextExtactor(poifsDir, fs);
-			}
-			if(entry.getName().equals("Package")) {
-			   OPCPackage pkg = OPCPackage.open(
-			         poifsDir.createDocumentInputStream(entry.getName())
-			   );
-			   return createExtractor(pkg);
-			}
-		}
-		throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
-	}
-	
-	
+
+    /**
+     * @deprecated Use {@link #createExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings("unused")
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs)
+            throws IOException, InvalidFormatException, OpenXML4JException, XmlException
+    {
+        return createExtractor(poifsDir);
+    }
+
+    public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
+            InvalidFormatException, OpenXML4JException, XmlException
+    {
+        // Look for certain entries in the stream, to figure it
+        // out from
+        if (poifsDir.hasEntry("Workbook")) {
+            if (getPreferEventExtractor()) {
+                return new EventBasedExcelExtractor(poifsDir);
+            }
+            return new ExcelExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("WordDocument")) {
+            // Old or new style word document?
+            try {
+                return new WordExtractor(poifsDir);
+            } catch (OldWordFileFormatException e) {
+                return new Word6Extractor(poifsDir);
+            }
+        }
+
+        if (poifsDir.hasEntry("PowerPoint Document")) {
+            return new PowerPointExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("VisioDocument")) {
+            return new VisioTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("Quill")) {
+            return new PublisherTextExtractor(poifsDir);
+        }
+
+        if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F")
+                || poifsDir.hasEntry("__substg1.0_0047001E")
+                || poifsDir.hasEntry("__substg1.0_0047001F")
+                || poifsDir.hasEntry("__substg1.0_0037001E")
+                || poifsDir.hasEntry("__substg1.0_0037001F"))
+        {
+            return new OutlookTextExtactor(poifsDir);
+        }
+
+        for (Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext();) {
+            Entry entry = entries.next();
+
+            if (entry.getName().equals("Package")) {
+                OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
+                return createExtractor(pkg);
+            }
+        }
+        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+    }
+
 	/**
 	 * Returns an array of text extractors, one for each of
 	 *  the embeded documents in the file (if there are any).
 	 * If there are no embeded documents, you'll get back an
-	 *  empty array. Otherwise, you'll get one open 
+	 *  empty array. Otherwise, you'll get one open
 	 *  {@link POITextExtractor} for each embeded file.
 	 */
 	public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
@ -258,16 +272,16 @@ public class ExtractorFactory {
 		ArrayList<Entry> dirs = new ArrayList<Entry>();
 		// For anything else not directly held in as a POIFS directory
 		ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
-		
+
      // Find all the embeded directories
-		POIFSFileSystem fs = ext.getFileSystem();
-		if(fs == null) {
+		DirectoryEntry root = ext.getRoot();
+		if(root == null) {
 			throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
 		}
-		
+
 		if(ext instanceof ExcelExtractor) {
 			// These are in MBD... under the root
-			Iterator<Entry> it = fs.getRoot().getEntries();
+			Iterator<Entry> it = root.getEntries();
 			while(it.hasNext()) {
 				Entry entry = it.next();
 				if(entry.getName().startsWith("MBD")) {
@ -278,7 +292,7 @@ public class ExtractorFactory {
 			// These are in ObjectPool -> _... under the root
 			try {
 				DirectoryEntry op = (DirectoryEntry)
-					fs.getRoot().getEntry("ObjectPool");
+				        root.getEntry("ObjectPool");
 				Iterator<Entry> it = op.getEntries();
 				while(it.hasNext()) {
 					Entry entry = it.next();
@ -302,7 +316,7 @@ public class ExtractorFactory {
 		      }
 		   }
 		}
-		
+
 		// Create the extractors
 		if(
 		      (dirs == null || dirs.size() == 0) &&
@ -310,11 +324,11 @@ public class ExtractorFactory {
 		){
 			return new POITextExtractor[0];
 		}
-		
+
 		ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
 		for(int i=0; i<dirs.size(); i++) {
 			e.add( createExtractor(
-					(DirectoryNode)dirs.get(i), ext.getFileSystem()
+					(DirectoryNode)dirs.get(i)
 			) );
 		}
 		for(int i=0; i<nonPOIFS.size(); i++) {
@ -336,7 +350,7 @@ public class ExtractorFactory {
 	 * Returns an array of text extractors, one for each of
 	 *  the embeded documents in the file (if there are any).
 	 * If there are no embeded documents, you'll get back an
-	 *  empty array. Otherwise, you'll get one open 
+	 *  empty array. Otherwise, you'll get one open
 	 *  {@link POITextExtractor} for each embeded file.
 	 */
 	public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocument.java
@ -23,6 +23,8 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;

+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
 import org.apache.poi.hwpf.model.BookmarksTables;
 import org.apache.poi.hwpf.model.CHPBinTable;
 import org.apache.poi.hwpf.model.CPSplitCalculator;
@ -190,7 +192,9 @@ public final class HWPFDocument extends HWPFDocumentCore
   * @param pfilesystem The POIFSFileSystem that contains the Word document.
   * @throws IOException If there is an unexpected IOException from the passed
   *         in POIFSFileSystem.
+   * @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
   */
+  @Deprecated
  public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
  {
     this(directory);
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFDocumentCore.java
@ -17,10 +17,17 @@

 package org.apache.poi.hwpf;

+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;

+import org.apache.poi.hwpf.usermodel.ObjectsPool;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
 import org.apache.poi.EncryptedDocumentException;
 import org.apache.poi.POIDocument;
 import org.apache.poi.hwpf.model.CHPBinTable;
@ -46,6 +53,9 @@ import org.apache.poi.util.Internal;
 */
 public abstract class HWPFDocumentCore extends POIDocument
 {
+  /** Holds OLE2 objects */
+  protected ObjectPoolImpl _objectPool;
+
  /** The FIB */
  protected FileInformationBlock _fib;

@ -148,7 +158,21 @@ public abstract class HWPFDocumentCore extends POIDocument
    if(_fib.isFEncrypted()) {
    	throw new EncryptedDocumentException("Cannot process encrypted word files!");
    }
-  }
+
+        {
+            DirectoryEntry objectPoolEntry;
+            try
+            {
+                objectPoolEntry = (DirectoryEntry) directory
+                        .getEntry( "ObjectPool" );
+            }
+            catch ( FileNotFoundException exc )
+            {
+                objectPoolEntry = directory.createDirectory( "ObjectPool" );
+            }
+            _objectPool = new ObjectPoolImpl( objectPoolEntry );
+        }
+    }

    /**
     * Returns the range which covers the whole of the document, but excludes
@ -211,5 +235,10 @@ public abstract class HWPFDocumentCore extends POIDocument
    return _fib;
  }

+    public ObjectsPool getObjectsPool()
+    {
+        return _objectPool;
+    }
+
    public abstract TextPieceTable getTextTable();
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/HWPFOldDocument.java
@ -44,6 +44,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
        this(fs.getRoot());
    }

+    @Deprecated
    public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
            throws IOException {
       this(directory);
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
@ -47,6 +47,7 @@ import org.apache.poi.hwpf.usermodel.Section;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
@ -56,6 +57,32 @@ import org.w3c.dom.Element;
@Beta
 public abstract class AbstractWordConverter
 {
+    private static final class Structure implements Comparable<Structure>
+    {
+        final int end;
+        final int start;
+        final Object structure;
+
+        Structure( Bookmark bookmark )
+        {
+            this.start = bookmark.getStart();
+            this.end = bookmark.getEnd();
+            this.structure = bookmark;
+        }
+
+        Structure( Field field )
+        {
+            this.start = field.getFieldStartOffset();
+            this.end = field.getFieldEndOffset();
+            this.structure = field;
+        }
+
+        public int compareTo( Structure o )
+        {
+            return start < o.start ? -1 : start == o.start ? 0 : 1;
+        }
+    }
+
    private static final byte BEL_MARK = 7;

    private static final byte FIELD_BEGIN_MARK = 19;
@ -396,6 +423,13 @@ public abstract class AbstractWordConverter
                    processDrawnObject( doc, characterRun, block );
                    continue;
                }
+                if ( characterRun.isOle2()
+                        && ( wordDocument instanceof HWPFDocument ) )
+                {
+                    HWPFDocument doc = (HWPFDocument) wordDocument;
+                    processOle2( doc, characterRun, block );
+                    continue;
+                }
            }

            if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
@ -613,10 +647,11 @@ public abstract class AbstractWordConverter
            CharacterRun characterRun, OfficeDrawing officeDrawing,
            String path, Element block );

-    protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range endnoteTextRange );
+    protected abstract void processEndnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range endnoteTextRange );

-    protected void processField( HWPFDocument hwpfDocument, Range parentRange,
+    protected void processField( HWPFDocument wordDocument, Range parentRange,
            int currentTableLevel, Field field, Element currentBlock )
    {
        switch ( field.getType() )
@ -633,7 +668,7 @@ public abstract class AbstractWordConverter
                if ( matcher.find() )
                {
                    String pageref = matcher.group( 1 );
-                    processPageref( hwpfDocument, currentBlock,
+                    processPageref( wordDocument, currentBlock,
                            field.secondSubrange( parentRange ),
                            currentTableLevel, pageref );
                    return;
@ -641,6 +676,36 @@ public abstract class AbstractWordConverter
            }
            break;
        }
+        case 58: // Embedded Object
+        {
+            if ( !field.hasSeparator() )
+            {
+                logger.log( POILogger.WARN, parentRange + " contains " + field
+                        + " with 'Embedded Object' but without separator mark" );
+                return;
+            }
+
+            CharacterRun separator = field
+                    .getMarkSeparatorCharacterRun( parentRange );
+
+            if ( separator.isOle2() )
+            {
+                // the only supported so far
+                boolean processed = processOle2( wordDocument, separator,
+                        currentBlock );
+
+                // if we didn't output OLE - output field value
+                if ( !processed )
+                {
+                    processCharacters( wordDocument, currentTableLevel,
+                            field.secondSubrange( parentRange ), currentBlock );
+                }
+
+                return;
+            }
+
+            break;
+        }
        case 88: // hyperlink
        {
            final Range firstSubrange = field.firstSubrange( parentRange );
@ -653,7 +718,7 @@ public abstract class AbstractWordConverter
                if ( matcher.find() )
                {
                    String hyperlink = matcher.group( 1 );
-                    processHyperlink( hwpfDocument, currentBlock,
+                    processHyperlink( wordDocument, currentBlock,
                            field.secondSubrange( parentRange ),
                            currentTableLevel, hyperlink );
                    return;
@ -665,12 +730,13 @@ public abstract class AbstractWordConverter

        logger.log( POILogger.WARN, parentRange + " contains " + field
                + " with unsupported type or format" );
-        processCharacters( hwpfDocument, currentTableLevel,
+        processCharacters( wordDocument, currentTableLevel,
                field.secondSubrange( parentRange ), currentBlock );
    }

-    protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
-            int noteIndex, Element block, Range footnoteTextRange );
+    protected abstract void processFootnoteAutonumbered(
+            HWPFDocument wordDocument, int noteIndex, Element block,
+            Range footnoteTextRange );

    protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
@ -732,6 +798,40 @@ public abstract class AbstractWordConverter
        }
    }

+    private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
+            Element block )
+    {
+        Entry entry = doc.getObjectsPool().getObjectById(
+                "_" + characterRun.getPicOffset() );
+        if ( entry == null )
+        {
+            logger.log( POILogger.WARN, "Referenced OLE2 object '",
+                    Integer.valueOf( characterRun.getPicOffset() ),
+                    "' not found in ObjectPool" );
+            return false;
+        }
+
+        try
+        {
+            return processOle2( doc, block, entry );
+        }
+        catch ( Exception exc )
+        {
+            logger.log( POILogger.WARN,
+                    "Unable to convert internal OLE2 object '",
+                    Integer.valueOf( characterRun.getPicOffset() ), "': ", exc,
+                    exc );
+            return false;
+        }
+    }
+
+    @SuppressWarnings( "unused" )
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        return false;
+    }
+
    protected abstract void processPageref( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
            String pageref );
@ -896,30 +996,4 @@ public abstract class AbstractWordConverter
        return endMark;
    }

-    private static final class Structure implements Comparable<Structure>
-    {
-        final int end;
-        final int start;
-        final Object structure;
-
-        Structure( Bookmark bookmark )
-        {
-            this.start = bookmark.getStart();
-            this.end = bookmark.getEnd();
-            this.structure = bookmark;
-        }
-
-        Structure( Field field )
-        {
-            this.start = field.getFieldStartOffset();
-            this.end = field.getFieldEndOffset();
-            this.structure = field;
-        }
-
-        public int compareTo( Structure o )
-        {
-            return start < o.start ? -1 : start == o.start ? 0 : 1;
-        }
-    }
-
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/AbstractWordUtils.java
@ -34,6 +34,7 @@ import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.Beta;
 import org.apache.poi.util.IOUtils;
@ -422,6 +423,19 @@ public class AbstractWordUtils
        return !isEmpty( str );
    }

+    public static HWPFDocumentCore loadDoc( final DirectoryNode root )
+            throws IOException
+    {
+        try
+        {
+            return new HWPFDocument( root );
+        }
+        catch ( OldWordFileFormatException exc )
+        {
+            return new HWPFOldDocument( root );
+        }
+    }
+
    public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
    {
        final FileInputStream istream = new FileInputStream( docFile );
@ -438,16 +452,13 @@ public class AbstractWordUtils
    public static HWPFDocumentCore loadDoc( InputStream inputStream )
            throws IOException
    {
-        final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
-                .verifyAndBuildPOIFS( inputStream );
-        try
-        {
-            return new HWPFDocument( poifsFileSystem );
-        }
-        catch ( OldWordFileFormatException exc )
-        {
-            return new HWPFOldDocument( poifsFileSystem );
-        }
+        return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) );
+    }
+
+    public static HWPFDocumentCore loadDoc(
+            final POIFSFileSystem poifsFileSystem ) throws IOException
+    {
+        return loadDoc( poifsFileSystem.getRoot() );
    }

    static String substringBeforeLast( String str, String separator )
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToFoConverter.java
@ -276,8 +276,8 @@ public class WordToFoConverter extends AbstractWordConverter
    }

    @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
    {
        final String textIndex = String.valueOf( internalLinkCounter
                .incrementAndGet() );
@ -297,7 +297,8 @@ public class WordToFoConverter extends AbstractWordConverter
        setId( backwardLink, forwardLinkName );
        endnote.appendChild( backwardLink );

-        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
+        processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
+                endnote );

        WordToFoUtils.compactInlines( endnote );
        this.endnotes.add( endnote );
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToHtmlConverter.java
@ -63,7 +63,6 @@ import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;
@Beta
 public class WordToHtmlConverter extends AbstractWordConverter
 {
-
    /**
     * Holds properties values, applied to current <tt>p</tt> element. Those
     * properties shall not be doubled in children <tt>span</tt> elements.
@ -282,10 +281,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
    }

    @Override
-    protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
-            Element block, Range endnoteTextRange )
+    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+            int noteIndex, Element block, Range endnoteTextRange )
    {
-        processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
+        processNoteAutonumbered( wordDocument, "end", noteIndex, block,
+                endnoteTextRange );
    }

    @Override
--- a/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/converter/WordToTextConverter.java
@ -2,10 +2,14 @@ package org.apache.poi.hwpf.converter;

 import java.io.File;
 import java.io.FileWriter;
+import java.io.StringWriter;
+import java.lang.reflect.Method;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicInteger;

+import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerFactory;
@ -25,6 +29,8 @@ import org.apache.poi.hwpf.usermodel.Section;
 import org.apache.poi.hwpf.usermodel.Table;
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.util.Beta;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@ -33,6 +39,29 @@ import org.w3c.dom.Element;
 public class WordToTextConverter extends AbstractWordConverter
 {

+    public static String getText( DirectoryNode root ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root );
+        return getText( wordDocument );
+    }
+
+    public static String getText( File docFile ) throws Exception
+    {
+        final HWPFDocumentCore wordDocument = AbstractWordUtils
+                .loadDoc( docFile );
+        return getText( wordDocument );
+    }
+
+    public static String getText( final HWPFDocumentCore wordDocument )
+            throws Exception
+    {
+        WordToTextConverter wordToTextConverter = new WordToTextConverter(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+        wordToTextConverter.processDocument( wordDocument );
+        return wordToTextConverter.getText();
+    }
+
    /**
     * Java main() interface to interact with {@link WordToTextConverter}
     * 
@ -91,8 +120,24 @@ public class WordToTextConverter extends AbstractWordConverter

    private Element notes = null;

+    private boolean outputSummaryInformation = false;
+
    private final TextDocumentFacade textDocumentFacade;

+    /**
+     * Creates new instance of {@link WordToTextConverter}. Can be used for
+     * output several {@link HWPFDocument}s into single text document.
+     * 
+     * @throws ParserConfigurationException
+     *             if an internal {@link DocumentBuilder} cannot be created
+     */
+    public WordToTextConverter() throws ParserConfigurationException
+    {
+        this.textDocumentFacade = new TextDocumentFacade(
+                DocumentBuilderFactory.newInstance().newDocumentBuilder()
+                        .newDocument() );
+    }
+
    /**
     * Creates new instance of {@link WordToTextConverter}. Can be used for
     * output several {@link HWPFDocument}s into single text document.
@ -110,6 +155,28 @@ public class WordToTextConverter extends AbstractWordConverter
        return textDocumentFacade.getDocument();
    }

+    public String getText() throws Exception
+    {
+        StringWriter stringWriter = new StringWriter();
+        DOMSource domSource = new DOMSource( getDocument() );
+        StreamResult streamResult = new StreamResult( stringWriter );
+
+        TransformerFactory tf = TransformerFactory.newInstance();
+        Transformer serializer = tf.newTransformer();
+        // TODO set encoding from a command argument
+        serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+        serializer.setOutputProperty( OutputKeys.INDENT, "no" );
+        serializer.setOutputProperty( OutputKeys.METHOD, "text" );
+        serializer.transform( domSource, streamResult );
+
+        return stringWriter.toString();
+    }
+
+    public boolean isOutputSummaryInformation()
+    {
+        return outputSummaryInformation;
+    }
+
    @Override
    protected void outputCharacters( Element block, CharacterRun characterRun,
            String text )
@ -138,18 +205,24 @@ public class WordToTextConverter extends AbstractWordConverter
    protected void processDocumentInformation(
            SummaryInformation summaryInformation )
    {
-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
-            textDocumentFacade.setTitle( summaryInformation.getTitle() );
+        if ( isOutputSummaryInformation() )
+        {
+            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
+                textDocumentFacade.setTitle( summaryInformation.getTitle() );

-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
-            textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
+            if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
+                textDocumentFacade.addAuthor( summaryInformation.getAuthor() );

-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
-            textDocumentFacade
-                    .addDescription( summaryInformation.getComments() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getComments() ) )
+                textDocumentFacade.addDescription( summaryInformation
+                        .getComments() );

-        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
-            textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
+            if ( AbstractWordUtils
+                    .isNotEmpty( summaryInformation.getKeywords() ) )
+                textDocumentFacade.addKeywords( summaryInformation
+                        .getKeywords() );
+        }
    }

    @Override
@ -222,6 +295,48 @@ public class WordToTextConverter extends AbstractWordConverter
        note.appendChild( textDocumentFacade.createText( "\n" ) );
    }

+    @Override
+    protected boolean processOle2( HWPFDocument wordDocument, Element block,
+            Entry entry ) throws Exception
+    {
+        if ( !( entry instanceof DirectoryNode ) )
+            return false;
+        DirectoryNode directoryNode = (DirectoryNode) entry;
+
+        // even if no ExtractorFactory in classpath
+        if ( directoryNode.hasEntry( "WordDocument" ) )
+        {
+            String text = WordToTextConverter.getText( (DirectoryNode) entry );
+            block.appendChild( textDocumentFacade
+                    .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
+                            + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+            return true;
+        }
+
+        try
+        {
+            Class<?> cls = Class
+                    .forName( "org.apache.poi.extractor.ExtractorFactory" );
+            Method createExtractor = cls.getMethod( "createExtractor",
+                    DirectoryNode.class );
+            Object extractor = createExtractor.invoke( null, directoryNode );
+
+            Method getText = extractor.getClass().getMethod( "getText" );
+            String text = (String) getText.invoke( extractor );
+
+            block.appendChild( textDocumentFacade
+                    .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
+                            + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+            return true;
+        }
+        catch ( ClassNotFoundException exc )
+        {
+            // no extractor in classpath
+        }
+
+        return false;
+    }
+
    @Override
    protected void processPageref( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
@ -254,7 +369,7 @@ public class WordToTextConverter extends AbstractWordConverter
        textDocumentFacade.body.appendChild( sectionElement );
    }

-    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+    protected void processTable( HWPFDocumentCore wordDocument, Element flow,
            Table table )
    {
        final int tableRows = table.numRows();
@ -275,8 +390,8 @@ public class WordToTextConverter extends AbstractWordConverter
                    tableCellElement.appendChild( textDocumentFacade
                            .createText( "\t" ) );

-                processParagraphes( hwpfDocument, tableCellElement, tableCell,
-                        table.getTableLevel() );
+                processCharacters( wordDocument, table.getTableLevel(),
+                        tableCell, tableCellElement );
                tableRowElement.appendChild( tableCellElement );
            }

@ -285,4 +400,9 @@ public class WordToTextConverter extends AbstractWordConverter
        }
    }

+    public void setOutputSummaryInformation( boolean outputDocumentInformation )
+    {
+        this.outputSummaryInformation = outputDocumentInformation;
+    }
+
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/Word6Extractor.java
@ -19,6 +19,10 @@ package org.apache.poi.hwpf.extractor;

 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+import org.apache.poi.hwpf.usermodel.HeaderStories;

 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFOldDocument;
@ -47,16 +51,32 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
 		this( new POIFSFileSystem(is) );
 	}

-	/**
-	 * Create a new Word Extractor
-	 * @param fs POIFSFileSystem containing the word file
-	 */
-	public Word6Extractor(POIFSFileSystem fs) throws IOException {
-		this(fs.getRoot(), fs);
-	}
-	public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-	    this(new HWPFOldDocument(dir,fs));
-	}
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
+     */
+    public Word6Extractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( fs.getRoot() );
+    }
+
+    /**
+     * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    @SuppressWarnings( "unused" )
+    public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }
+
+    public Word6Extractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFOldDocument( dir ) );
+    }

 	/**
 	 * Create a new Word Extractor
@ -71,6 +91,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
     * Get the text from the word file, as an array with one String
     *  per paragraph
     */
+	@Deprecated
 	public String[] getParagraphText() {
 	    String[] ret;

@ -95,13 +116,25 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
 	    return ret;
 	}

-    public String getText() {
-        StringBuffer text = new StringBuffer();
-        
-        for(String t : getParagraphText()) {
-            text.append(t);
+    public String getText()
+    {
+        try
+        {
+            WordToTextConverter wordToTextConverter = new WordToTextConverter();
+            wordToTextConverter.processDocument( doc );
+            return wordToTextConverter.getText();
        }
+        catch ( Exception exc )
+        {
+            // fall-back
+            StringBuffer text = new StringBuffer();

-        return text.toString();
+            for ( String t : getParagraphText() )
+            {
+                text.append( t );
+            }
+
+            return text.toString();
+        }
    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@ -20,9 +20,12 @@ package org.apache.poi.hwpf.extractor;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.Arrays;

+import org.apache.poi.hwpf.converter.WordToTextConverter;
+
 import org.apache.poi.POIOLE2TextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.usermodel.HeaderStories;
@ -33,231 +36,300 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
 * Class to extract the text from a Word Document.
- *
- * You should use either getParagraphText() or getText() unless
- *  you have a strong reason otherwise.
- *
+ * 
+ * You should use either getParagraphText() or getText() unless you have a
+ * strong reason otherwise.
+ * 
 * @author Nick Burch
 */
-public final class WordExtractor extends POIOLE2TextExtractor {
-	private POIFSFileSystem fs;
-	private HWPFDocument doc;
+public final class WordExtractor extends POIOLE2TextExtractor
+{
+    private HWPFDocument doc;

-	/**
-	 * Create a new Word Extractor
-	 * @param is InputStream containing the word file
-	 */
-	public WordExtractor(InputStream is) throws IOException {
-		this( HWPFDocument.verifyAndBuildPOIFS(is) );
-	}
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param is
+     *            InputStream containing the word file
+     */
+    public WordExtractor( InputStream is ) throws IOException
+    {
+        this( HWPFDocument.verifyAndBuildPOIFS( is ) );
+    }

-	/**
-	 * Create a new Word Extractor
-	 * @param fs POIFSFileSystem containing the word file
-	 */
-	public WordExtractor(POIFSFileSystem fs) throws IOException {
-		this(new HWPFDocument(fs));
-		this.fs = fs;
-	}
-	public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
-		this(new HWPFDocument(dir, fs));
-		this.fs = fs;
-	}
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param fs
+     *            POIFSFileSystem containing the word file
+     */
+    public WordExtractor( POIFSFileSystem fs ) throws IOException
+    {
+        this( new HWPFDocument( fs ) );
+    }

-	/**
-	 * Create a new Word Extractor
-	 * @param doc The HWPFDocument to extract from
-	 */
-	public WordExtractor(HWPFDocument doc) {
-		super(doc);
-		this.doc = doc;
-	}
+    /**
+     * @deprecated Use {@link #WordExtractor(DirectoryNode)} instead
+     */
+    @Deprecated
+    public WordExtractor( DirectoryNode dir, POIFSFileSystem fs )
+            throws IOException
+    {
+        this( dir );
+    }

-	/**
-	 * Command line extractor, so people will stop moaning that
-	 *  they can't just run this.
-	 */
-	public static void main(String[] args) throws IOException {
-		if(args.length == 0) {
-			System.err.println("Use:");
-			System.err.println("   java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
-			System.exit(1);
-		}
+    public WordExtractor( DirectoryNode dir ) throws IOException
+    {
+        this( new HWPFDocument( dir ) );
+    }

-		// Process the first argument as a file
-		FileInputStream fin = new FileInputStream(args[0]);
-		WordExtractor extractor = new WordExtractor(fin);
-		System.out.println(extractor.getText());
-	}
+    /**
+     * Create a new Word Extractor
+     * 
+     * @param doc
+     *            The HWPFDocument to extract from
+     */
+    public WordExtractor( HWPFDocument doc )
+    {
+        super( doc );
+        this.doc = doc;
+    }

-	/**
-	 * Get the text from the word file, as an array with one String
-	 *  per paragraph
-	 */
-        public String[] getParagraphText() {
-                String[] ret;
+    /**
+     * Command line extractor, so people will stop moaning that they can't just
+     * run this.
+     */
+    public static void main( String[] args ) throws IOException
+    {
+        if ( args.length == 0 )
+        {
+            System.err.println( "Use:" );
+            System.err
+                    .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
+            System.exit( 1 );
+        }

-                // Extract using the model code
-                try {
-                        Range r = doc.getRange();
+        // Process the first argument as a file
+        FileInputStream fin = new FileInputStream( args[0] );
+        WordExtractor extractor = new WordExtractor( fin );
+        System.out.println( extractor.getText() );
+    }

-                        ret = getParagraphText(r);
-                } catch (Exception e) {
-                        // Something's up with turning the text pieces into paragraphs
-                        // Fall back to ripping out the text pieces
-                        ret = new String[1];
-                        ret[0] = getTextFromPieces();
+    /**
+     * Get the text from the word file, as an array with one String per
+     * paragraph
+     */
+    public String[] getParagraphText()
+    {
+        String[] ret;
+
+        // Extract using the model code
+        try
+        {
+            Range r = doc.getRange();
+
+            ret = getParagraphText( r );
+        }
+        catch ( Exception e )
+        {
+            // Something's up with turning the text pieces into paragraphs
+            // Fall back to ripping out the text pieces
+            ret = new String[1];
+            ret[0] = getTextFromPieces();
+        }
+
+        return ret;
+    }
+
+    public String[] getFootnoteText()
+    {
+        Range r = doc.getFootnoteRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getMainTextboxText()
+    {
+        Range r = doc.getMainTextboxRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getEndnoteText()
+    {
+        Range r = doc.getEndnoteRange();
+
+        return getParagraphText( r );
+    }
+
+    public String[] getCommentsText()
+    {
+        Range r = doc.getCommentsRange();
+
+        return getParagraphText( r );
+    }
+
+    protected static String[] getParagraphText( Range r )
+    {
+        String[] ret;
+        ret = new String[r.numParagraphs()];
+        for ( int i = 0; i < ret.length; i++ )
+        {
+            Paragraph p = r.getParagraph( i );
+            ret[i] = p.text();
+
+            // Fix the line ending
+            if ( ret[i].endsWith( "\r" ) )
+            {
+                ret[i] = ret[i] + "\n";
+            }
+        }
+        return ret;
+    }
+
+    /**
+     * Add the header/footer text, if it's not empty
+     */
+    private void appendHeaderFooter( String text, StringBuffer out )
+    {
+        if ( text == null || text.length() == 0 )
+            return;
+
+        text = text.replace( '\r', '\n' );
+        if ( !text.endsWith( "\n" ) )
+        {
+            out.append( text );
+            out.append( '\n' );
+            return;
+        }
+        if ( text.endsWith( "\n\n" ) )
+        {
+            out.append( text.substring( 0, text.length() - 1 ) );
+            return;
+        }
+        out.append( text );
+        return;
+    }
+
+    /**
+     * Grab the text from the headers
+     */
+    @Deprecated
+    public String getHeaderText()
+    {
+        HeaderStories hs = new HeaderStories( doc );
+
+        StringBuffer ret = new StringBuffer();
+        if ( hs.getFirstHeader() != null )
+        {
+            appendHeaderFooter( hs.getFirstHeader(), ret );
+        }
+        if ( hs.getEvenHeader() != null )
+        {
+            appendHeaderFooter( hs.getEvenHeader(), ret );
+        }
+        if ( hs.getOddHeader() != null )
+        {
+            appendHeaderFooter( hs.getOddHeader(), ret );
+        }
+
+        return ret.toString();
+    }
+
+    /**
+     * Grab the text from the footers
+     */
+    @Deprecated
+    public String getFooterText()
+    {
+        HeaderStories hs = new HeaderStories( doc );
+
+        StringBuffer ret = new StringBuffer();
+        if ( hs.getFirstFooter() != null )
+        {
+            appendHeaderFooter( hs.getFirstFooter(), ret );
+        }
+        if ( hs.getEvenFooter() != null )
+        {
+            appendHeaderFooter( hs.getEvenFooter(), ret );
+        }
+        if ( hs.getOddFooter() != null )
+        {
+            appendHeaderFooter( hs.getOddFooter(), ret );
+        }
+
+        return ret.toString();
+    }
+
+    /**
+     * Grab the text out of the text pieces. Might also include various bits of
+     * crud, but will work in cases where the text piece -> paragraph mapping is
+     * broken. Fast too.
+     */
+    public String getTextFromPieces()
+    {
+        String text = doc.getDocumentText();
+
+        // Fix line endings (Note - won't get all of them
+        text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
+        text = text.replaceAll( "\r\r", "\r\n\r\n" );
+
+        if ( text.endsWith( "\r" ) )
+        {
+            text += "\n";
+        }
+
+        return text;
+    }
+
+    /**
+     * Grab the text, based on the WordToTextConverter. Shouldn't include any
+     * crud, but slower than getTextFromPieces().
+     */
+    public String getText()
+    {
+        try
+        {
+            final StringWriter stringWriter = new StringWriter();
+            @SuppressWarnings( "unused" )
+            WordToTextConverter wordToTextConverter = new WordToTextConverter()
+            {
+                {
+                    HeaderStories hs = new HeaderStories( doc );
+
+                    if ( hs.getFirstHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstHeaderSubrange() );
+                    if ( hs.getEvenHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenHeaderSubrange() );
+                    if ( hs.getOddHeaderSubrange() != null )
+                        processDocumentPart( doc, hs.getOddHeaderSubrange() );
+
+                    processDocument( doc );
+                    processDocumentPart( doc, doc.getMainTextboxRange() );
+
+                    if ( hs.getFirstFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getFirstFooterSubrange() );
+                    if ( hs.getEvenFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getEvenFooterSubrange() );
+                    if ( hs.getOddFooterSubrange() != null )
+                        processDocumentPart( doc, hs.getOddFooterSubrange() );
+
+                    stringWriter.append( getText() );
                }
-
-                return ret;
+            };
+            return stringWriter.toString();
        }
-
-        public String[] getFootnoteText() {
-                Range r = doc.getFootnoteRange();
-
-                return getParagraphText(r);
+        catch ( Exception exc )
+        {
+            throw new RuntimeException( exc );
        }
+    }

-        public String[] getMainTextboxText() {
-                Range r = doc.getMainTextboxRange();
-
-                return getParagraphText(r);
-        }
-
-        public String[] getEndnoteText() {
-                Range r = doc.getEndnoteRange();
-
-                return getParagraphText(r);
-        }
-
-        public String[] getCommentsText() {
-                Range r = doc.getCommentsRange();
-
-                return getParagraphText(r);
-        }
-
-        protected static String[] getParagraphText(Range r) {
-                String[] ret;
-                ret = new String[r.numParagraphs()];
-                for (int i = 0; i < ret.length; i++) {
-                        Paragraph p = r.getParagraph(i);
-                        ret[i] = p.text();
-
-                        // Fix the line ending
-                        if (ret[i].endsWith("\r")) {
-                                ret[i] = ret[i] + "\n";
-                        }
-                }
-                return ret;
-        }
-
-        /**
-	 * Add the header/footer text, if it's not empty
-	 */
-	private void appendHeaderFooter(String text, StringBuffer out) {
-		if(text == null || text.length() == 0)
-			return;
-
-		text = text.replace('\r', '\n');
-		if(! text.endsWith("\n")) {
-			out.append(text);
-			out.append('\n');
-			return;
-		}
-		if(text.endsWith("\n\n")) {
-			out.append(text.substring(0, text.length()-1));
-			return;
-		}
-		out.append(text);
-		return;
-	}
-	/**
-	 * Grab the text from the headers
-	 */
-	public String getHeaderText() {
-		HeaderStories hs = new HeaderStories(doc);
-
-		StringBuffer ret = new StringBuffer();
-		if(hs.getFirstHeader() != null) {
-			appendHeaderFooter(hs.getFirstHeader(), ret);
-		}
-		if(hs.getEvenHeader() != null) {
-			appendHeaderFooter(hs.getEvenHeader(), ret);
-		}
-		if(hs.getOddHeader() != null) {
-			appendHeaderFooter(hs.getOddHeader(), ret);
-		}
-
-		return ret.toString();
-	}
-	/**
-	 * Grab the text from the footers
-	 */
-	public String getFooterText() {
-		HeaderStories hs = new HeaderStories(doc);
-
-		StringBuffer ret = new StringBuffer();
-		if(hs.getFirstFooter() != null) {
-			appendHeaderFooter(hs.getFirstFooter(), ret);
-		}
-		if(hs.getEvenFooter() != null) {
-			appendHeaderFooter(hs.getEvenFooter(), ret);
-		}
-		if(hs.getOddFooter() != null) {
-			appendHeaderFooter(hs.getOddFooter(), ret);
-		}
-
-		return ret.toString();
-	}
-
-	/**
-	 * Grab the text out of the text pieces. Might also include various
-	 *  bits of crud, but will work in cases where the text piece -> paragraph
-	 *  mapping is broken. Fast too.
-	 */
-	public String getTextFromPieces() {
-    	String text = doc.getDocumentText();
-
-    	// Fix line endings (Note - won't get all of them
-    	text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
-    	text = text.replaceAll("\r\r", "\r\n\r\n");
-
-    	if(text.endsWith("\r")) {
-    		text += "\n";
-    	}
-
-    	return text;
-	}
-
-	/**
-	 * Grab the text, based on the paragraphs. Shouldn't include any crud,
-	 *  but slightly slower than getTextFromPieces().
-	 */
-	public String getText() {
-	   StringBuffer ret = new StringBuffer();
-
-	   ret.append(getHeaderText());
-
-	   ArrayList<String> text = new ArrayList<String>();
-	   text.addAll(Arrays.asList(getParagraphText()));
-	   text.addAll(Arrays.asList(getMainTextboxText()));
-	   text.addAll(Arrays.asList(getFootnoteText()));
-	   text.addAll(Arrays.asList(getEndnoteText()));
-
-	   for(String p : text) {
-	      ret.append(p);
-	   }
-
-	   ret.append(getFooterText());
-
-	   return ret.toString();
-	}
-
-	/**
-	 * Removes any fields (eg macros, page markers etc)
-	 *  from the string.
-	 */
-	public static String stripFields(String text) {
-		return Range.stripFields(text);
-	}
+    /**
+     * Removes any fields (eg macros, page markers etc) from the string.
+     */
+    public static String stripFields( String text )
+    {
+        return Range.stripFields( text );
+    }
 }
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Field.java
@ -17,17 +17,23 @@ public interface Field
     */
    int getFieldStartOffset();

+    CharacterRun getMarkEndCharacterRun( Range parent );
+
    /**
     * @return character position of end field mark
     */
    int getMarkEndOffset();

+    CharacterRun getMarkSeparatorCharacterRun( Range parent );
+
    /**
     * @return character position of separator field mark (if present,
     *         {@link NullPointerException} otherwise)
     */
    int getMarkSeparatorOffset();

+    CharacterRun getMarkStartCharacterRun( Range parent );
+
    /**
     * @return character position of start field mark
     */
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/FieldImpl.java
@ -112,6 +112,12 @@ class FieldImpl implements Field
        return startPlex.getFcStart();
    }

+    public CharacterRun getMarkEndCharacterRun( Range parent )
+    {
+        return new Range( getMarkEndOffset(), getMarkEndOffset() + 1, parent )
+                .getCharacterRun( 0 );
+    }
+
    /**
     * @return character position of end field mark
     */
@ -120,6 +126,15 @@ class FieldImpl implements Field
        return endPlex.getFcStart();
    }

+    public CharacterRun getMarkSeparatorCharacterRun( Range parent )
+    {
+        if ( !hasSeparator() )
+            return null;
+
+        return new Range( getMarkSeparatorOffset(),
+                getMarkSeparatorOffset() + 1, parent ).getCharacterRun( 0 );
+    }
+
    /**
     * @return character position of separator field mark (if present,
     *         {@link NullPointerException} otherwise)
@ -129,6 +144,12 @@ class FieldImpl implements Field
        return separatorPlex.getFcStart();
    }

+    public CharacterRun getMarkStartCharacterRun( Range parent )
+    {
+        return new Range( getMarkStartOffset(), getMarkStartOffset() + 1,
+                parent ).getCharacterRun( 0 );
+    }
+
    /**
     * @return character position of start field mark
     */
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
@ -82,35 +82,96 @@ public final class HeaderStories {
                fib.getPlcfHddSize(), 0 );
    }

-	public String getFootnoteSeparator() {
-		return getAt(0);
-	}
-	public String getFootnoteContSeparator() {
-		return getAt(1);
-	}
-	public String getFootnoteContNote() {
-		return getAt(2);
-	}
-	public String getEndnoteSeparator() {
-		return getAt(3);
-	}
-	public String getEndnoteContSeparator() {
-		return getAt(4);
-	}
-	public String getEndnoteContNote() {
-		return getAt(5);
-	}
+    @Deprecated
+    public String getFootnoteSeparator()
+    {
+        return getAt( 0 );
+    }

+    @Deprecated
+    public String getFootnoteContSeparator()
+    {
+        return getAt( 1 );
+    }

+    @Deprecated
+    public String getFootnoteContNote()
+    {
+        return getAt( 2 );
+    }
+
+    @Deprecated
+    public String getEndnoteSeparator()
+    {
+        return getAt( 3 );
+    }
+
+    @Deprecated
+    public String getEndnoteContSeparator()
+    {
+        return getAt( 4 );
+    }
+
+    @Deprecated
+    public String getEndnoteContNote()
+    {
+        return getAt( 5 );
+    }
+
+    public Range getFootnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 0 );
+    }
+
+    public Range getFootnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 1 );
+    }
+
+    public Range getFootnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 2 );
+    }
+
+    public Range getEndnoteSeparatorSubrange()
+    {
+        return getSubrangeAt( 3 );
+    }
+
+    public Range getEndnoteContSeparatorSubrange()
+    {
+        return getSubrangeAt( 4 );
+    }
+
+    public Range getEndnoteContNoteSubrange()
+    {
+        return getSubrangeAt( 5 );
+    }
+
+	@Deprecated
 	public String getEvenHeader() {
 		return getAt(6+0);
 	}
+    @Deprecated
 	public String getOddHeader() {
 		return getAt(6+1);
 	}
+    @Deprecated
 	public String getFirstHeader() {
 		return getAt(6+4);
 	}
+	
+
+    public Range getEvenHeaderSubrange() {
+        return getSubrangeAt(6+0);
+    }
+    public Range getOddHeaderSubrange() {
+        return getSubrangeAt(6+1);
+    }
+    public Range getFirstHeaderSubrange() {
+        return getSubrangeAt(6+4);
+    }
+    
 	/**
 	 * Returns the correct, defined header for the given
 	 *  one based page
@ -135,16 +196,39 @@ public final class HeaderStories {
 		return getOddHeader();
 	}

+	@Deprecated
+    public String getEvenFooter()
+    {
+        return getAt( 6 + 2 );
+    }
+
+    @Deprecated
+    public String getOddFooter()
+    {
+        return getAt( 6 + 3 );
+    }
+
+    @Deprecated
+    public String getFirstFooter()
+    {
+        return getAt( 6 + 5 );
+    }
+
+    public Range getEvenFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 2 );
+    }
+
+    public Range getOddFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 3 );
+    }
+
+    public Range getFirstFooterSubrange()
+    {
+        return getSubrangeAt( 6 + 5 );
+    }

-	public String getEvenFooter() {
-		return getAt(6+2);
-	}
-	public String getOddFooter() {
-		return getAt(6+3);
-	}
-	public String getFirstFooter() {
-		return getAt(6+5);
-	}
 	/**
 	 * Returns the correct, defined footer for the given
 	 *  one based page
@ -174,6 +258,7 @@ public final class HeaderStories {
 	 * Get the string that's pointed to by the
 	 *  given plcfHdd index
 	 */
+    @Deprecated
 	private String getAt(int plcfHddIndex) {
 		if(plcfHdd == null) return null;

@ -209,6 +294,32 @@ public final class HeaderStories {
 		return text;
 	}

+    private Range getSubrangeAt( int plcfHddIndex )
+    {
+        if ( plcfHdd == null )
+            return null;
+
+        GenericPropertyNode prop = plcfHdd.getProperty( plcfHddIndex );
+        if ( prop.getStart() == prop.getEnd() )
+        {
+            // Empty story
+            return null;
+        }
+        if ( prop.getEnd() < prop.getStart() )
+        {
+            // Broken properties?
+            return null;
+        }
+
+        final int headersLength = headerStories.getEndOffset()
+                - headerStories.getStartOffset();
+        int start = Math.min( prop.getStart(), headersLength );
+        int end = Math.min( prop.getEnd(), headersLength );
+
+        return new Range( headerStories.getStartOffset() + start,
+                headerStories.getStartOffset() + end, headerStories );
+    }
+
 	public Range getRange() {
 		return headerStories;
 	}
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectPoolImpl.java
@ -0,0 +1,34 @@
+package org.apache.poi.hwpf.usermodel;
+
+import java.io.FileNotFoundException;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.util.Internal;
+
+@Internal
+public class ObjectPoolImpl implements ObjectsPool
+{
+    private DirectoryEntry _objectPool;
+
+    public ObjectPoolImpl( DirectoryEntry _objectPool )
+    {
+        super();
+        this._objectPool = _objectPool;
+    }
+
+    public Entry getObjectById( String objId )
+    {
+        if ( _objectPool == null )
+            return null;
+
+        try
+        {
+            return _objectPool.getEntry( objId );
+        }
+        catch ( FileNotFoundException exc )
+        {
+            return null;
+        }
+    }
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/ObjectsPool.java
@ -0,0 +1,8 @@
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.poifs.filesystem.Entry;
+
+public interface ObjectsPool
+{
+    public Entry getObjectById( String objId );
+}
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@ -36,6 +36,8 @@ import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
 import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
 import org.apache.poi.hwpf.sprm.SprmBuffer;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;

 /**
 * This class is the central class of the HWPF object model. All properties that
@ -52,6 +54,8 @@ import org.apache.poi.util.LittleEndian;
 */
 public class Range { // TODO -instantiable superclass

+    private POILogger logger = POILogFactory.getLogger( Range.class );
+    
 	public static final int TYPE_PARAGRAPH = 0;
 	public static final int TYPE_CHARACTER = 1;
 	public static final int TYPE_SECTION = 2;
@ -888,9 +892,12 @@ public class Range { // TODO -instantiable superclass
        initAll();
        if ( tableEndInclusive >= this._parEnd )
        {
-            throw new ArrayIndexOutOfBoundsException(
-                    "The table's bounds fall outside of this Range" );
+            logger.log( POILogger.WARN, "The table's bounds ", "["
+                    + this._parStart + "; " + tableEndInclusive + ")",
+                    " fall outside of this Range paragraphs numbers ", "["
+                            + this._parStart + "; " + this._parEnd + ")" );
        }
+
        if ( tableEndInclusive < 0 )
        {
            throw new ArrayIndexOutOfBoundsException(
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/converter/TestWordToTextConverter.java
@ -0,0 +1,22 @@
+package org.apache.poi.hwpf.converter;
+
+import junit.framework.TestCase;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+public class TestWordToTextConverter extends TestCase
+{
+
+    /**
+     * [FAILING] Bug 47731 - Word Extractor considers text copied from some
+     * website as an embedded object
+     */
+    public void testBug47731() throws Exception
+    {
+        HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
+        String foundText = WordToTextConverter.getText( doc );
+
+        assertTrue( foundText
+                .contains( "Soak the rice in water for three to four hours" ) );
+    }
+}
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java
@ -33,6 +33,16 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 * @author Nick Burch (nick at torchbox dot com)
 */
 public final class TestWordExtractor extends TestCase {
+
+    public static void assertEquals( String expected, String actual )
+    {
+        String newExpected = expected.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        String newActual = actual.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        TestCase.assertEquals( newExpected, newActual );
+    }
+
 	private String[] p_text1 = new String[] {
 			"This is a simple word document\r\n",
 			"\r\n",
@ -107,12 +117,14 @@ public final class TestWordExtractor extends TestCase {
 	public void testGetText() {
 		assertEquals(p_text1_block, extractor.getText());

-		// For the 2nd, should give similar answers for
-		//  the two methods, differing only in line endings
-		assertEquals(
-		      extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), 
-		      extractor2.getText().replaceAll("[\\r\\n]", ""));
-	}
+        // For the 2nd, should give similar answers for
+        // the two methods, differing only in line endings
+
+        // nope, they must have different results, because of garbage
+        // assertEquals(
+        // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
+        // extractor2.getText().replaceAll("[\\r\\n]", ""));
+    }

 	/**
 	 * Test textPieces based extraction
@ -330,7 +342,7 @@ public final class TestWordExtractor extends TestCase {
       
       // Open directly 
       for(DirectoryNode dir : files) {
-          WordExtractor extractor = new WordExtractor(dir, null);
+          WordExtractor extractor = new WordExtractor(dir);
          assertEquals(p_text1_block, extractor.getText());
       }

--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestBugs.java
@ -43,6 +43,15 @@ import org.apache.poi.util.IOUtils;
 public class TestBugs extends TestCase
 {

+    public static void assertEquals( String expected, String actual )
+    {
+        String newExpected = expected.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        String newActual = actual.replaceAll( "\r\n", "\n" )
+                .replaceAll( "\r", "\n" ).trim();
+        TestCase.assertEquals( newExpected, newActual );
+    }
+
    private static void assertTableStructures( Range expected, Range actual )
    {
        assertEquals( expected.numParagraphs(), actual.numParagraphs() );