mirror of https://github.com/apache/poi.git
Add Word-to-Text converter and use it as replacement for WordExtractor
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155336 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
888f51c566
commit
49697de696
|
@ -34,6 +34,7 @@
|
|||
|
||||
<changes>
|
||||
<release version="3.8-beta4" date="2011-??-??">
|
||||
<action dev="poi-developers" type="add">Add Word-to-Text converter and use it as replacement for WordExtractor</action>
|
||||
<action dev="poi-developers" type="fix">51604 - replace text fails for doc ( poi 3.8 beta release from download site )</action>
|
||||
<action dev="poi-developers" type="fix">Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF</action>
|
||||
<action dev="poi-developers" type="add">Support for conditional formatting in XSSF</action>
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.poi;
|
|||
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
|
@ -39,7 +40,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
|||
public POIOLE2TextExtractor(POIDocument document) {
|
||||
super(document);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the document information metadata for the document
|
||||
*/
|
||||
|
@ -52,20 +53,28 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
|||
public SummaryInformation getSummaryInformation() {
|
||||
return document.getSummaryInformation();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns an HPSF powered text extractor for the
|
||||
* Returns an HPSF powered text extractor for the
|
||||
* document properties metadata, such as title and author.
|
||||
*/
|
||||
public POITextExtractor getMetadataTextExtractor() {
|
||||
return new HPSFPropertiesExtractor(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the underlying POIFS FileSystem of
|
||||
* this document.
|
||||
*/
|
||||
public POIFSFileSystem getFileSystem() {
|
||||
return document.directory.getFileSystem();
|
||||
}
|
||||
public DirectoryEntry getRoot()
|
||||
{
|
||||
return document.directory;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the underlying POIFS FileSystem of this document.
|
||||
*
|
||||
* @deprecated Use {@link #getRoot()} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public POIFSFileSystem getFileSystem()
|
||||
{
|
||||
return document.directory.getFileSystem();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -61,17 +61,27 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
*/
|
||||
public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
||||
private DirectoryNode _dir;
|
||||
private POIFSFileSystem _fs;
|
||||
boolean _includeSheetNames = true;
|
||||
boolean _formulasNotResults = false;
|
||||
|
||||
public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
|
||||
super(null);
|
||||
_dir = dir;
|
||||
_fs = fs;
|
||||
}
|
||||
/**
 * Creates an extractor for the given directory. The file system argument
 * is not used; this constructor only delegates to the single-argument one.
 *
 * @param dir the directory handed on to the single-argument constructor
 * @param fs  ignored
 * @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead
 */
@Deprecated
@SuppressWarnings("unused")
public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
    this(dir);
}
|
||||
|
||||
/**
 * Creates an extractor bound to the given directory.
 * No document is passed to the superclass (it receives <code>null</code>).
 *
 * @param dir the directory this extractor keeps a reference to
 */
public EventBasedExcelExtractor(DirectoryNode dir) {
    super(null);
    this._dir = dir;
}
|
||||
|
||||
public EventBasedExcelExtractor(POIFSFileSystem fs) {
|
||||
this(fs.getRoot(), fs);
|
||||
this(fs.getRoot());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -79,9 +89,9 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
|||
* this document.
|
||||
*/
|
||||
public POIFSFileSystem getFileSystem() {
|
||||
return _fs;
|
||||
return _dir.getFileSystem();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Would return the document information metadata for the document,
|
||||
* if we supported it
|
||||
|
@ -200,7 +210,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
|||
outputNextStringValue = true;
|
||||
nextRow = frec.getRow();
|
||||
} else {
|
||||
thisText = _ft.formatNumberDateCell(frec);
|
||||
thisText = _ft.formatNumberDateCell(frec);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -234,7 +244,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
|
|||
case NumberRecord.sid:
|
||||
NumberRecord numrec = (NumberRecord) record;
|
||||
thisRow = numrec.getRow();
|
||||
thisText = _ft.formatNumberDateCell(numrec);
|
||||
thisText = _ft.formatNumberDateCell(numrec);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
|
|
@ -24,7 +24,6 @@ import java.io.InputStream;
|
|||
import java.io.PrintStream;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.ss.formula.eval.ErrorEval;
|
||||
import org.apache.poi.hssf.usermodel.HSSFCell;
|
||||
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
|
||||
import org.apache.poi.hssf.usermodel.HSSFComment;
|
||||
|
@ -35,12 +34,13 @@ import org.apache.poi.hssf.usermodel.HSSFSheet;
|
|||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.ss.formula.eval.ErrorEval;
|
||||
import org.apache.poi.ss.usermodel.HeaderFooter;
|
||||
|
||||
/**
|
||||
* A text extractor for Excel files.
|
||||
* <p>
|
||||
* Returns the textual content of the file, suitable for
|
||||
* Returns the textual content of the file, suitable for
|
||||
* indexing by something like Lucene, but not really
|
||||
* intended for display to the user.
|
||||
* </p>
|
||||
|
@ -59,19 +59,27 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
private boolean _includeCellComments = false;
|
||||
private boolean _includeBlankCells = false;
|
||||
private boolean _includeHeadersFooters = true;
|
||||
|
||||
|
||||
public ExcelExtractor(HSSFWorkbook wb) {
|
||||
super(wb);
|
||||
_wb = wb;
|
||||
_formatter = new HSSFDataFormatter();
|
||||
}
|
||||
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(fs.getRoot(), fs);
|
||||
this(fs.getRoot());
|
||||
}
|
||||
public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
|
||||
this(new HSSFWorkbook(dir, fs, true));
|
||||
/**
 * Creates an extractor for the given directory. The file system argument
 * is not used; this constructor only delegates to the single-argument one.
 *
 * @param dir the directory handed on to the single-argument constructor
 * @param fs  ignored
 * @throws IOException propagated from the delegated constructor
 * @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead
 */
@Deprecated
@SuppressWarnings("unused")
public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs)
        throws IOException
{
    this(dir);
}
|
||||
/**
 * Creates an extractor over a new {@link HSSFWorkbook} built from the
 * given directory.
 *
 * @param dir the directory passed to the workbook constructor
 * @throws IOException if building the workbook fails with an I/O error
 */
public ExcelExtractor(DirectoryNode dir)
        throws IOException
{
    this( new HSSFWorkbook( dir, true ) );
}
|
||||
|
||||
|
||||
private static final class CommandParseException extends Exception {
|
||||
public CommandParseException(String msg) {
|
||||
super(msg);
|
||||
|
@ -183,7 +191,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
return _headersFooters;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void printUsageMessage(PrintStream ps) {
|
||||
ps.println("Use:");
|
||||
ps.println(" " + ExcelExtractor.class.getName() + " [<flag> <value> [<flag> <value> [...]]] [-i <filename.xls>]");
|
||||
|
@ -201,7 +209,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
* Command line extractor.
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
|
||||
|
||||
CommandArgs cmdArgs;
|
||||
try {
|
||||
cmdArgs = new CommandArgs(args);
|
||||
|
@ -211,12 +219,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
System.exit(1);
|
||||
return; // suppress compiler error
|
||||
}
|
||||
|
||||
|
||||
if (cmdArgs.isRequestHelp()) {
|
||||
printUsageMessage(System.out);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
try {
|
||||
InputStream is;
|
||||
if(cmdArgs.getInputFile() == null) {
|
||||
|
@ -270,9 +278,9 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
* Default is to include them.
|
||||
*/
|
||||
public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
|
||||
_includeHeadersFooters = includeHeadersFooters;
|
||||
_includeHeadersFooters = includeHeadersFooters;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the text contents of the file
|
||||
*/
|
||||
|
@ -282,12 +290,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
// We don't care about the difference between
|
||||
// null (missing) and blank cells
|
||||
_wb.setMissingCellPolicy(HSSFRow.RETURN_BLANK_AS_NULL);
|
||||
|
||||
|
||||
// Process each sheet in turn
|
||||
for(int i=0;i<_wb.getNumberOfSheets();i++) {
|
||||
HSSFSheet sheet = _wb.getSheetAt(i);
|
||||
if(sheet == null) { continue; }
|
||||
|
||||
|
||||
if(_includeSheetNames) {
|
||||
String name = _wb.getSheetName(i);
|
||||
if(name != null) {
|
||||
|
@ -295,12 +303,12 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
text.append("\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Header text, if there is any
|
||||
if(_includeHeadersFooters) {
|
||||
text.append(_extractHeaderFooter(sheet.getHeader()));
|
||||
}
|
||||
|
||||
|
||||
int firstRow = sheet.getFirstRowNum();
|
||||
int lastRow = sheet.getLastRowNum();
|
||||
for(int j=firstRow;j<=lastRow;j++) {
|
||||
|
@ -313,7 +321,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
if(_includeBlankCells) {
|
||||
firstCell = 0;
|
||||
}
|
||||
|
||||
|
||||
for(int k=firstCell;k<lastCell;k++) {
|
||||
HSSFCell cell = row.getCell(k);
|
||||
boolean outputContents = true;
|
||||
|
@ -368,14 +376,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
case HSSFCell.CELL_TYPE_ERROR:
|
||||
text.append(ErrorEval.getText(cell.getErrorCellValue()));
|
||||
break;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")");
|
||||
}
|
||||
|
||||
|
||||
// Output the comment, if requested and exists
|
||||
HSSFComment comment = cell.getCellComment();
|
||||
if(_includeCellComments && comment != null) {
|
||||
|
@ -385,29 +393,29 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
text.append(" Comment by "+comment.getAuthor()+": "+commentText);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Output a tab if we're not on the last cell
|
||||
if(outputContents && k < (lastCell-1)) {
|
||||
text.append("\t");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Finish off the row
|
||||
text.append("\n");
|
||||
}
|
||||
|
||||
|
||||
// Finally Footer text, if there is any
|
||||
if(_includeHeadersFooters) {
|
||||
text.append(_extractHeaderFooter(sheet.getFooter()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
|
||||
public static String _extractHeaderFooter(HeaderFooter hf) {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
|
||||
if(hf.getLeft() != null) {
|
||||
text.append(hf.getLeft());
|
||||
}
|
||||
|
@ -423,7 +431,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
}
|
||||
if(text.length() > 0)
|
||||
text.append("\n");
|
||||
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,13 +15,14 @@
|
|||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
|
||||
|
||||
package org.apache.poi.poifs.filesystem;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.hpsf.ClassID;
|
||||
|
||||
|
@ -67,6 +68,12 @@ public interface DirectoryEntry
|
|||
|
||||
public int getEntryCount();
|
||||
|
||||
/**
|
||||
* Checks if an entry with the specified name is present
|
||||
*/
|
||||
|
||||
public boolean hasEntry( final String name );
|
||||
|
||||
/**
|
||||
* get a specified Entry by name
|
||||
*
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
|
||||
|
||||
package org.apache.poi.poifs.filesystem;
|
||||
|
||||
|
@ -53,7 +53,7 @@ public class DirectoryNode
|
|||
// the POIFSFileSystem we belong to
|
||||
private POIFSFileSystem _ofilesystem;
|
||||
// the NPOIFSFileSystem we belong to
|
||||
private NPOIFSFileSystem _nfilesystem;
|
||||
private NPOIFSFileSystem _nfilesystem;
|
||||
|
||||
// the path described by this document
|
||||
private POIFSDocumentPath _path;
|
||||
|
@ -72,7 +72,7 @@ public class DirectoryNode
|
|||
{
|
||||
this(property, parent, filesystem, (NPOIFSFileSystem)null);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* create a DirectoryNode. This method is not public by design; it
|
||||
* is intended strictly for the internal use of this package
|
||||
|
@ -87,7 +87,7 @@ public class DirectoryNode
|
|||
{
|
||||
this(property, parent, (POIFSFileSystem)null, nfilesystem);
|
||||
}
|
||||
|
||||
|
||||
private DirectoryNode(final DirectoryProperty property,
|
||||
final DirectoryNode parent,
|
||||
final POIFSFileSystem ofilesystem,
|
||||
|
@ -96,7 +96,7 @@ public class DirectoryNode
|
|||
super(property, parent);
|
||||
this._ofilesystem = ofilesystem;
|
||||
this._nfilesystem = nfilesystem;
|
||||
|
||||
|
||||
if (parent == null)
|
||||
{
|
||||
_path = new POIFSDocumentPath();
|
||||
|
@ -143,23 +143,23 @@ public class DirectoryNode
|
|||
{
|
||||
return _path;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return the filesystem that this belongs to
|
||||
*/
|
||||
public POIFSFileSystem getFileSystem()
|
||||
{
|
||||
return _ofilesystem;
|
||||
return _ofilesystem;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return the filesystem that this belongs to
|
||||
*/
|
||||
public NPOIFSFileSystem getNFileSystem()
|
||||
{
|
||||
return _nfilesystem;
|
||||
return _nfilesystem;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* open a document in the directory's entry's list of entries
|
||||
*
|
||||
|
@ -195,7 +195,7 @@ public class DirectoryNode
|
|||
throw new IOException("Entry '" + document.getName()
|
||||
+ "' is not a DocumentEntry");
|
||||
}
|
||||
|
||||
|
||||
DocumentEntry entry = (DocumentEntry)document;
|
||||
return new DocumentInputStream(entry);
|
||||
}
|
||||
|
@ -217,7 +217,7 @@ public class DirectoryNode
|
|||
|
||||
(( DirectoryProperty ) getProperty()).addChild(property);
|
||||
_ofilesystem.addDocument(document);
|
||||
|
||||
|
||||
_entries.add(rval);
|
||||
_byname.put(property.getName(), rval);
|
||||
return rval;
|
||||
|
@ -240,7 +240,7 @@ public class DirectoryNode
|
|||
|
||||
(( DirectoryProperty ) getProperty()).addChild(property);
|
||||
_nfilesystem.addDocument(document);
|
||||
|
||||
|
||||
_entries.add(rval);
|
||||
_byname.put(property.getName(), rval);
|
||||
return rval;
|
||||
|
@ -290,7 +290,7 @@ public class DirectoryNode
|
|||
{
|
||||
_entries.remove(entry);
|
||||
_byname.remove(entry.getName());
|
||||
|
||||
|
||||
if(_ofilesystem != null) {
|
||||
_ofilesystem.remove(entry);
|
||||
} else {
|
||||
|
@ -342,6 +342,11 @@ public class DirectoryNode
|
|||
return _entries.size();
|
||||
}
|
||||
|
||||
public boolean hasEntry( String name )
|
||||
{
|
||||
return name != null && _byname.containsKey( name );
|
||||
}
|
||||
|
||||
/**
|
||||
* get a specified Entry by name
|
||||
*
|
||||
|
@ -430,7 +435,7 @@ public class DirectoryNode
|
|||
{
|
||||
DirectoryNode rval;
|
||||
DirectoryProperty property = new DirectoryProperty(name);
|
||||
|
||||
|
||||
if(_ofilesystem != null) {
|
||||
rval = new DirectoryNode(property, _ofilesystem, this);
|
||||
_ofilesystem.addDirectory(property);
|
||||
|
@ -562,7 +567,7 @@ public class DirectoryNode
|
|||
* Returns an Iterator over all the entries
|
||||
*/
|
||||
public Iterator<Entry> iterator() {
|
||||
return getEntries();
|
||||
return getEntries();
|
||||
}
|
||||
|
||||
/* ********** END begin implementation of POIFSViewable ********** */
|
||||
|
|
|
@ -66,48 +66,48 @@ import org.apache.xmlbeans.XmlException;
|
|||
public class ExtractorFactory {
|
||||
public static final String CORE_DOCUMENT_REL =
|
||||
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
|
||||
|
||||
|
||||
|
||||
|
||||
/** Should this thread prefer event based over usermodel based extractors? */
|
||||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
||||
protected Boolean initialValue() { return Boolean.FALSE; }
|
||||
};
|
||||
/** Should all threads prefer event based over usermodel based extractors? */
|
||||
private static Boolean allPreferEventExtractors;
|
||||
|
||||
/**
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is false.
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is false.
|
||||
*/
|
||||
public static boolean getThreadPrefersEventExtractors() {
|
||||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is to use the thread level setting, which defaults to false.
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is to use the thread level setting, which defaults to false.
|
||||
*/
|
||||
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||
return allPreferEventExtractors;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* Will only be used if the All Threads setting is null.
|
||||
* Will only be used if the All Threads setting is null.
|
||||
*/
|
||||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||
threadPreferEventExtractors.set(preferEventExtractors);
|
||||
}
|
||||
/**
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* If set, will take preference over the Thread level setting.
|
||||
* If set, will take preference over the Thread level setting.
|
||||
*/
|
||||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||
allPreferEventExtractors = preferEventExtractors;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Should this thread use event based extractors if available?
|
||||
* Checks the all-threads one first, then thread specific.
|
||||
|
@ -118,8 +118,8 @@ public class ExtractorFactory {
|
|||
}
|
||||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
InputStream inp = null;
|
||||
try {
|
||||
|
@ -137,14 +137,14 @@ public class ExtractorFactory {
|
|||
if(inp != null) inp.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
// Figure out the kind of stream
|
||||
// If clearly doesn't do mark/reset, wrap up
|
||||
if(! inp.markSupported()) {
|
||||
inp = new PushbackInputStream(inp, 8);
|
||||
}
|
||||
|
||||
|
||||
if(POIFSFileSystem.hasPOIFSHeader(inp)) {
|
||||
return createExtractor(new POIFSFileSystem(inp));
|
||||
}
|
||||
|
@ -153,16 +153,16 @@ public class ExtractorFactory {
|
|||
}
|
||||
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
|
||||
}
|
||||
|
||||
|
||||
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
||||
PackageRelationshipCollection core =
|
||||
PackageRelationshipCollection core =
|
||||
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
||||
if(core.size() != 1) {
|
||||
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
|
||||
}
|
||||
|
||||
PackagePart corePart = pkg.getPart(core.getRelationship(0));
|
||||
|
||||
|
||||
// Is it XSSF?
|
||||
for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
|
@ -173,84 +173,98 @@ public class ExtractorFactory {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Is it XWPF?
|
||||
for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
return new XWPFWordExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Is it XSLF?
|
||||
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
|
||||
if(corePart.getContentType().equals(rel.getContentType())) {
|
||||
return new XSLFPowerPointExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
|
||||
}
|
||||
|
||||
|
||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
// Only ever an OLE2 one from the root of the FS
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
|
||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||
}
|
||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
// Look for certain entries in the stream, to figure it
|
||||
// out from
|
||||
for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
|
||||
Entry entry = entries.next();
|
||||
|
||||
if(entry.getName().equals("Workbook")) {
|
||||
if(getPreferEventExtractor()) {
|
||||
return new EventBasedExcelExtractor(poifsDir, fs);
|
||||
} else {
|
||||
return new ExcelExtractor(poifsDir, fs);
|
||||
}
|
||||
}
|
||||
if(entry.getName().equals("WordDocument")) {
|
||||
// Old or new style word document?
|
||||
try {
|
||||
return new WordExtractor(poifsDir, fs);
|
||||
} catch(OldWordFileFormatException e) {
|
||||
return new Word6Extractor(poifsDir, fs);
|
||||
}
|
||||
}
|
||||
if(entry.getName().equals("PowerPoint Document")) {
|
||||
return new PowerPointExtractor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("VisioDocument")) {
|
||||
return new VisioTextExtractor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("Quill")) {
|
||||
return new PublisherTextExtractor(poifsDir, fs);
|
||||
}
|
||||
if(
|
||||
entry.getName().equals("__substg1.0_1000001E") ||
|
||||
entry.getName().equals("__substg1.0_1000001F") ||
|
||||
entry.getName().equals("__substg1.0_0047001E") ||
|
||||
entry.getName().equals("__substg1.0_0047001F") ||
|
||||
entry.getName().equals("__substg1.0_0037001E") ||
|
||||
entry.getName().equals("__substg1.0_0037001F")
|
||||
) {
|
||||
return new OutlookTextExtactor(poifsDir, fs);
|
||||
}
|
||||
if(entry.getName().equals("Package")) {
|
||||
OPCPackage pkg = OPCPackage.open(
|
||||
poifsDir.createDocumentInputStream(entry.getName())
|
||||
);
|
||||
return createExtractor(pkg);
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #createExtractor(DirectoryNode)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
@SuppressWarnings("unused")
|
||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs)
|
||||
throws IOException, InvalidFormatException, OpenXML4JException, XmlException
|
||||
{
|
||||
return createExtractor(poifsDir);
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
|
||||
InvalidFormatException, OpenXML4JException, XmlException
|
||||
{
|
||||
// Look for certain entries in the stream, to figure it
|
||||
// out from
|
||||
if (poifsDir.hasEntry("Workbook")) {
|
||||
if (getPreferEventExtractor()) {
|
||||
return new EventBasedExcelExtractor(poifsDir);
|
||||
}
|
||||
return new ExcelExtractor(poifsDir);
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("WordDocument")) {
|
||||
// Old or new style word document?
|
||||
try {
|
||||
return new WordExtractor(poifsDir);
|
||||
} catch (OldWordFileFormatException e) {
|
||||
return new Word6Extractor(poifsDir);
|
||||
}
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("PowerPoint Document")) {
|
||||
return new PowerPointExtractor(poifsDir);
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("VisioDocument")) {
|
||||
return new VisioTextExtractor(poifsDir);
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("Quill")) {
|
||||
return new PublisherTextExtractor(poifsDir);
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F")
|
||||
|| poifsDir.hasEntry("__substg1.0_0047001E")
|
||||
|| poifsDir.hasEntry("__substg1.0_0047001F")
|
||||
|| poifsDir.hasEntry("__substg1.0_0037001E")
|
||||
|| poifsDir.hasEntry("__substg1.0_0037001F"))
|
||||
{
|
||||
return new OutlookTextExtactor(poifsDir);
|
||||
}
|
||||
|
||||
for (Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext();) {
|
||||
Entry entry = entries.next();
|
||||
|
||||
if (entry.getName().equals("Package")) {
|
||||
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
|
||||
return createExtractor(pkg);
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of text extractors, one for each of
|
||||
* the embedded documents in the file (if there are any).
|
||||
* If there are no embedded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*/
|
||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||
|
@ -258,16 +272,16 @@ public class ExtractorFactory {
|
|||
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
||||
// For anything else not directly held in as a POIFS directory
|
||||
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
|
||||
|
||||
|
||||
// Find all the embedded directories
|
||||
POIFSFileSystem fs = ext.getFileSystem();
|
||||
if(fs == null) {
|
||||
DirectoryEntry root = ext.getRoot();
|
||||
if(root == null) {
|
||||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
||||
}
|
||||
|
||||
|
||||
if(ext instanceof ExcelExtractor) {
|
||||
// These are in MBD... under the root
|
||||
Iterator<Entry> it = fs.getRoot().getEntries();
|
||||
Iterator<Entry> it = root.getEntries();
|
||||
while(it.hasNext()) {
|
||||
Entry entry = it.next();
|
||||
if(entry.getName().startsWith("MBD")) {
|
||||
|
@ -278,7 +292,7 @@ public class ExtractorFactory {
|
|||
// These are in ObjectPool -> _... under the root
|
||||
try {
|
||||
DirectoryEntry op = (DirectoryEntry)
|
||||
fs.getRoot().getEntry("ObjectPool");
|
||||
root.getEntry("ObjectPool");
|
||||
Iterator<Entry> it = op.getEntries();
|
||||
while(it.hasNext()) {
|
||||
Entry entry = it.next();
|
||||
|
@ -302,7 +316,7 @@ public class ExtractorFactory {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Create the extractors
|
||||
if(
|
||||
(dirs == null || dirs.size() == 0) &&
|
||||
|
@ -310,11 +324,11 @@ public class ExtractorFactory {
|
|||
){
|
||||
return new POITextExtractor[0];
|
||||
}
|
||||
|
||||
|
||||
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
|
||||
for(int i=0; i<dirs.size(); i++) {
|
||||
e.add( createExtractor(
|
||||
(DirectoryNode)dirs.get(i), ext.getFileSystem()
|
||||
(DirectoryNode)dirs.get(i)
|
||||
) );
|
||||
}
|
||||
for(int i=0; i<nonPOIFS.size(); i++) {
|
||||
|
@ -336,7 +350,7 @@ public class ExtractorFactory {
|
|||
* Returns an array of text extractors, one for each of
|
||||
* the embedded documents in the file (if there are any).
|
||||
* If there are no embedded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*/
|
||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
|
||||
|
|
|
@ -23,6 +23,8 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
|
||||
import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
|
||||
|
||||
import org.apache.poi.hwpf.model.BookmarksTables;
|
||||
import org.apache.poi.hwpf.model.CHPBinTable;
|
||||
import org.apache.poi.hwpf.model.CPSplitCalculator;
|
||||
|
@ -190,7 +192,9 @@ public final class HWPFDocument extends HWPFDocumentCore
|
|||
* @param pfilesystem The POIFSFileSystem that contains the Word document.
|
||||
* @throws IOException If there is an unexpected IOException from the passed
|
||||
* in POIFSFileSystem.
|
||||
* @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
|
||||
{
|
||||
this(directory);
|
||||
|
|
|
@ -17,10 +17,17 @@
|
|||
|
||||
package org.apache.poi.hwpf;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.PushbackInputStream;
|
||||
|
||||
import org.apache.poi.hwpf.usermodel.ObjectsPool;
|
||||
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
|
||||
import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
|
||||
|
||||
import org.apache.poi.EncryptedDocumentException;
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.hwpf.model.CHPBinTable;
|
||||
|
@ -46,6 +53,9 @@ import org.apache.poi.util.Internal;
|
|||
*/
|
||||
public abstract class HWPFDocumentCore extends POIDocument
|
||||
{
|
||||
/** Holds OLE2 objects */
|
||||
protected ObjectPoolImpl _objectPool;
|
||||
|
||||
/** The FIB */
|
||||
protected FileInformationBlock _fib;
|
||||
|
||||
|
@ -148,7 +158,21 @@ public abstract class HWPFDocumentCore extends POIDocument
|
|||
if(_fib.isFEncrypted()) {
|
||||
throw new EncryptedDocumentException("Cannot process encrypted word files!");
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
DirectoryEntry objectPoolEntry;
|
||||
try
|
||||
{
|
||||
objectPoolEntry = (DirectoryEntry) directory
|
||||
.getEntry( "ObjectPool" );
|
||||
}
|
||||
catch ( FileNotFoundException exc )
|
||||
{
|
||||
objectPoolEntry = directory.createDirectory( "ObjectPool" );
|
||||
}
|
||||
_objectPool = new ObjectPoolImpl( objectPoolEntry );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the range which covers the whole of the document, but excludes
|
||||
|
@ -211,5 +235,10 @@ public abstract class HWPFDocumentCore extends POIDocument
|
|||
return _fib;
|
||||
}
|
||||
|
||||
public ObjectsPool getObjectsPool()
|
||||
{
|
||||
return _objectPool;
|
||||
}
|
||||
|
||||
public abstract TextPieceTable getTextTable();
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||
this(fs.getRoot());
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
|
||||
throws IOException {
|
||||
this(directory);
|
||||
|
|
|
@ -47,6 +47,7 @@ import org.apache.poi.hwpf.usermodel.Section;
|
|||
import org.apache.poi.hwpf.usermodel.Table;
|
||||
import org.apache.poi.hwpf.usermodel.TableCell;
|
||||
import org.apache.poi.hwpf.usermodel.TableRow;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.util.Beta;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
@ -56,6 +57,32 @@ import org.w3c.dom.Element;
|
|||
@Beta
|
||||
public abstract class AbstractWordConverter
|
||||
{
|
||||
private static final class Structure implements Comparable<Structure>
|
||||
{
|
||||
final int end;
|
||||
final int start;
|
||||
final Object structure;
|
||||
|
||||
Structure( Bookmark bookmark )
|
||||
{
|
||||
this.start = bookmark.getStart();
|
||||
this.end = bookmark.getEnd();
|
||||
this.structure = bookmark;
|
||||
}
|
||||
|
||||
Structure( Field field )
|
||||
{
|
||||
this.start = field.getFieldStartOffset();
|
||||
this.end = field.getFieldEndOffset();
|
||||
this.structure = field;
|
||||
}
|
||||
|
||||
public int compareTo( Structure o )
|
||||
{
|
||||
return start < o.start ? -1 : start == o.start ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
private static final byte BEL_MARK = 7;
|
||||
|
||||
private static final byte FIELD_BEGIN_MARK = 19;
|
||||
|
@ -396,6 +423,13 @@ public abstract class AbstractWordConverter
|
|||
processDrawnObject( doc, characterRun, block );
|
||||
continue;
|
||||
}
|
||||
if ( characterRun.isOle2()
|
||||
&& ( wordDocument instanceof HWPFDocument ) )
|
||||
{
|
||||
HWPFDocument doc = (HWPFDocument) wordDocument;
|
||||
processOle2( doc, characterRun, block );
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
|
||||
|
@ -613,10 +647,11 @@ public abstract class AbstractWordConverter
|
|||
CharacterRun characterRun, OfficeDrawing officeDrawing,
|
||||
String path, Element block );
|
||||
|
||||
protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range endnoteTextRange );
|
||||
protected abstract void processEndnoteAutonumbered(
|
||||
HWPFDocument wordDocument, int noteIndex, Element block,
|
||||
Range endnoteTextRange );
|
||||
|
||||
protected void processField( HWPFDocument hwpfDocument, Range parentRange,
|
||||
protected void processField( HWPFDocument wordDocument, Range parentRange,
|
||||
int currentTableLevel, Field field, Element currentBlock )
|
||||
{
|
||||
switch ( field.getType() )
|
||||
|
@ -633,7 +668,7 @@ public abstract class AbstractWordConverter
|
|||
if ( matcher.find() )
|
||||
{
|
||||
String pageref = matcher.group( 1 );
|
||||
processPageref( hwpfDocument, currentBlock,
|
||||
processPageref( wordDocument, currentBlock,
|
||||
field.secondSubrange( parentRange ),
|
||||
currentTableLevel, pageref );
|
||||
return;
|
||||
|
@ -641,6 +676,36 @@ public abstract class AbstractWordConverter
|
|||
}
|
||||
break;
|
||||
}
|
||||
case 58: // Embedded Object
|
||||
{
|
||||
if ( !field.hasSeparator() )
|
||||
{
|
||||
logger.log( POILogger.WARN, parentRange + " contains " + field
|
||||
+ " with 'Embedded Object' but without separator mark" );
|
||||
return;
|
||||
}
|
||||
|
||||
CharacterRun separator = field
|
||||
.getMarkSeparatorCharacterRun( parentRange );
|
||||
|
||||
if ( separator.isOle2() )
|
||||
{
|
||||
// the only supported so far
|
||||
boolean processed = processOle2( wordDocument, separator,
|
||||
currentBlock );
|
||||
|
||||
// if we didn't output OLE - output field value
|
||||
if ( !processed )
|
||||
{
|
||||
processCharacters( wordDocument, currentTableLevel,
|
||||
field.secondSubrange( parentRange ), currentBlock );
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case 88: // hyperlink
|
||||
{
|
||||
final Range firstSubrange = field.firstSubrange( parentRange );
|
||||
|
@ -653,7 +718,7 @@ public abstract class AbstractWordConverter
|
|||
if ( matcher.find() )
|
||||
{
|
||||
String hyperlink = matcher.group( 1 );
|
||||
processHyperlink( hwpfDocument, currentBlock,
|
||||
processHyperlink( wordDocument, currentBlock,
|
||||
field.secondSubrange( parentRange ),
|
||||
currentTableLevel, hyperlink );
|
||||
return;
|
||||
|
@ -665,12 +730,13 @@ public abstract class AbstractWordConverter
|
|||
|
||||
logger.log( POILogger.WARN, parentRange + " contains " + field
|
||||
+ " with unsupported type or format" );
|
||||
processCharacters( hwpfDocument, currentTableLevel,
|
||||
processCharacters( wordDocument, currentTableLevel,
|
||||
field.secondSubrange( parentRange ), currentBlock );
|
||||
}
|
||||
|
||||
protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range footnoteTextRange );
|
||||
protected abstract void processFootnoteAutonumbered(
|
||||
HWPFDocument wordDocument, int noteIndex, Element block,
|
||||
Range footnoteTextRange );
|
||||
|
||||
protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
|
||||
Element currentBlock, Range textRange, int currentTableLevel,
|
||||
|
@ -732,6 +798,40 @@ public abstract class AbstractWordConverter
|
|||
}
|
||||
}
|
||||
|
||||
private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
|
||||
Element block )
|
||||
{
|
||||
Entry entry = doc.getObjectsPool().getObjectById(
|
||||
"_" + characterRun.getPicOffset() );
|
||||
if ( entry == null )
|
||||
{
|
||||
logger.log( POILogger.WARN, "Referenced OLE2 object '",
|
||||
Integer.valueOf( characterRun.getPicOffset() ),
|
||||
"' not found in ObjectPool" );
|
||||
return false;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
return processOle2( doc, block, entry );
|
||||
}
|
||||
catch ( Exception exc )
|
||||
{
|
||||
logger.log( POILogger.WARN,
|
||||
"Unable to convert internal OLE2 object '",
|
||||
Integer.valueOf( characterRun.getPicOffset() ), "': ", exc,
|
||||
exc );
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings( "unused" )
|
||||
protected boolean processOle2( HWPFDocument wordDocument, Element block,
|
||||
Entry entry ) throws Exception
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
protected abstract void processPageref( HWPFDocumentCore wordDocument,
|
||||
Element currentBlock, Range textRange, int currentTableLevel,
|
||||
String pageref );
|
||||
|
@ -896,30 +996,4 @@ public abstract class AbstractWordConverter
|
|||
return endMark;
|
||||
}
|
||||
|
||||
private static final class Structure implements Comparable<Structure>
|
||||
{
|
||||
final int end;
|
||||
final int start;
|
||||
final Object structure;
|
||||
|
||||
Structure( Bookmark bookmark )
|
||||
{
|
||||
this.start = bookmark.getStart();
|
||||
this.end = bookmark.getEnd();
|
||||
this.structure = bookmark;
|
||||
}
|
||||
|
||||
Structure( Field field )
|
||||
{
|
||||
this.start = field.getFieldStartOffset();
|
||||
this.end = field.getFieldEndOffset();
|
||||
this.structure = field;
|
||||
}
|
||||
|
||||
public int compareTo( Structure o )
|
||||
{
|
||||
return start < o.start ? -1 : start == o.start ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ import org.apache.poi.hwpf.usermodel.Paragraph;
|
|||
import org.apache.poi.hwpf.usermodel.Table;
|
||||
import org.apache.poi.hwpf.usermodel.TableCell;
|
||||
import org.apache.poi.hwpf.usermodel.TableRow;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.Beta;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
|
@ -422,6 +423,19 @@ public class AbstractWordUtils
|
|||
return !isEmpty( str );
|
||||
}
|
||||
|
||||
public static HWPFDocumentCore loadDoc( final DirectoryNode root )
|
||||
throws IOException
|
||||
{
|
||||
try
|
||||
{
|
||||
return new HWPFDocument( root );
|
||||
}
|
||||
catch ( OldWordFileFormatException exc )
|
||||
{
|
||||
return new HWPFOldDocument( root );
|
||||
}
|
||||
}
|
||||
|
||||
public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
|
||||
{
|
||||
final FileInputStream istream = new FileInputStream( docFile );
|
||||
|
@ -438,16 +452,13 @@ public class AbstractWordUtils
|
|||
public static HWPFDocumentCore loadDoc( InputStream inputStream )
|
||||
throws IOException
|
||||
{
|
||||
final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
|
||||
.verifyAndBuildPOIFS( inputStream );
|
||||
try
|
||||
{
|
||||
return new HWPFDocument( poifsFileSystem );
|
||||
}
|
||||
catch ( OldWordFileFormatException exc )
|
||||
{
|
||||
return new HWPFOldDocument( poifsFileSystem );
|
||||
}
|
||||
return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) );
|
||||
}
|
||||
|
||||
public static HWPFDocumentCore loadDoc(
|
||||
final POIFSFileSystem poifsFileSystem ) throws IOException
|
||||
{
|
||||
return loadDoc( poifsFileSystem.getRoot() );
|
||||
}
|
||||
|
||||
static String substringBeforeLast( String str, String separator )
|
||||
|
|
|
@ -276,8 +276,8 @@ public class WordToFoConverter extends AbstractWordConverter
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
|
||||
Element block, Range endnoteTextRange )
|
||||
protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range endnoteTextRange )
|
||||
{
|
||||
final String textIndex = String.valueOf( internalLinkCounter
|
||||
.incrementAndGet() );
|
||||
|
@ -297,7 +297,8 @@ public class WordToFoConverter extends AbstractWordConverter
|
|||
setId( backwardLink, forwardLinkName );
|
||||
endnote.appendChild( backwardLink );
|
||||
|
||||
processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
|
||||
processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
|
||||
endnote );
|
||||
|
||||
WordToFoUtils.compactInlines( endnote );
|
||||
this.endnotes.add( endnote );
|
||||
|
|
|
@ -63,7 +63,6 @@ import static org.apache.poi.hwpf.converter.AbstractWordUtils.TWIPS_PER_INCH;
|
|||
@Beta
|
||||
public class WordToHtmlConverter extends AbstractWordConverter
|
||||
{
|
||||
|
||||
/**
|
||||
* Holds properties values, applied to current <tt>p</tt> element. Those
|
||||
* properties shall not be doubled in children <tt>span</tt> elements.
|
||||
|
@ -282,10 +281,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
|
||||
Element block, Range endnoteTextRange )
|
||||
protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range endnoteTextRange )
|
||||
{
|
||||
processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
|
||||
processNoteAutonumbered( wordDocument, "end", noteIndex, block,
|
||||
endnoteTextRange );
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -2,10 +2,14 @@ package org.apache.poi.hwpf.converter;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.StringWriter;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
|
@ -25,6 +29,8 @@ import org.apache.poi.hwpf.usermodel.Section;
|
|||
import org.apache.poi.hwpf.usermodel.Table;
|
||||
import org.apache.poi.hwpf.usermodel.TableCell;
|
||||
import org.apache.poi.hwpf.usermodel.TableRow;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.util.Beta;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
|
@ -33,6 +39,29 @@ import org.w3c.dom.Element;
|
|||
public class WordToTextConverter extends AbstractWordConverter
|
||||
{
|
||||
|
||||
public static String getText( DirectoryNode root ) throws Exception
|
||||
{
|
||||
final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root );
|
||||
return getText( wordDocument );
|
||||
}
|
||||
|
||||
public static String getText( File docFile ) throws Exception
|
||||
{
|
||||
final HWPFDocumentCore wordDocument = AbstractWordUtils
|
||||
.loadDoc( docFile );
|
||||
return getText( wordDocument );
|
||||
}
|
||||
|
||||
public static String getText( final HWPFDocumentCore wordDocument )
|
||||
throws Exception
|
||||
{
|
||||
WordToTextConverter wordToTextConverter = new WordToTextConverter(
|
||||
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||
.newDocument() );
|
||||
wordToTextConverter.processDocument( wordDocument );
|
||||
return wordToTextConverter.getText();
|
||||
}
|
||||
|
||||
/**
|
||||
* Java main() interface to interact with {@link WordToTextConverter}
|
||||
*
|
||||
|
@ -91,8 +120,24 @@ public class WordToTextConverter extends AbstractWordConverter
|
|||
|
||||
private Element notes = null;
|
||||
|
||||
private boolean outputSummaryInformation = false;
|
||||
|
||||
private final TextDocumentFacade textDocumentFacade;
|
||||
|
||||
/**
|
||||
* Creates new instance of {@link WordToTextConverter}. Can be used for
|
||||
* output several {@link HWPFDocument}s into single text document.
|
||||
*
|
||||
* @throws ParserConfigurationException
|
||||
* if an internal {@link DocumentBuilder} cannot be created
|
||||
*/
|
||||
public WordToTextConverter() throws ParserConfigurationException
|
||||
{
|
||||
this.textDocumentFacade = new TextDocumentFacade(
|
||||
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||
.newDocument() );
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates new instance of {@link WordToTextConverter}. Can be used for
|
||||
* output several {@link HWPFDocument}s into single text document.
|
||||
|
@ -110,6 +155,28 @@ public class WordToTextConverter extends AbstractWordConverter
|
|||
return textDocumentFacade.getDocument();
|
||||
}
|
||||
|
||||
public String getText() throws Exception
|
||||
{
|
||||
StringWriter stringWriter = new StringWriter();
|
||||
DOMSource domSource = new DOMSource( getDocument() );
|
||||
StreamResult streamResult = new StreamResult( stringWriter );
|
||||
|
||||
TransformerFactory tf = TransformerFactory.newInstance();
|
||||
Transformer serializer = tf.newTransformer();
|
||||
// TODO set encoding from a command argument
|
||||
serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
|
||||
serializer.setOutputProperty( OutputKeys.INDENT, "no" );
|
||||
serializer.setOutputProperty( OutputKeys.METHOD, "text" );
|
||||
serializer.transform( domSource, streamResult );
|
||||
|
||||
return stringWriter.toString();
|
||||
}
|
||||
|
||||
public boolean isOutputSummaryInformation()
|
||||
{
|
||||
return outputSummaryInformation;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void outputCharacters( Element block, CharacterRun characterRun,
|
||||
String text )
|
||||
|
@ -138,18 +205,24 @@ public class WordToTextConverter extends AbstractWordConverter
|
|||
protected void processDocumentInformation(
|
||||
SummaryInformation summaryInformation )
|
||||
{
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
|
||||
textDocumentFacade.setTitle( summaryInformation.getTitle() );
|
||||
if ( isOutputSummaryInformation() )
|
||||
{
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
|
||||
textDocumentFacade.setTitle( summaryInformation.getTitle() );
|
||||
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
|
||||
textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
|
||||
textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
|
||||
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
|
||||
textDocumentFacade
|
||||
.addDescription( summaryInformation.getComments() );
|
||||
if ( AbstractWordUtils
|
||||
.isNotEmpty( summaryInformation.getComments() ) )
|
||||
textDocumentFacade.addDescription( summaryInformation
|
||||
.getComments() );
|
||||
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
|
||||
textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
|
||||
if ( AbstractWordUtils
|
||||
.isNotEmpty( summaryInformation.getKeywords() ) )
|
||||
textDocumentFacade.addKeywords( summaryInformation
|
||||
.getKeywords() );
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -222,6 +295,48 @@ public class WordToTextConverter extends AbstractWordConverter
|
|||
note.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean processOle2( HWPFDocument wordDocument, Element block,
|
||||
Entry entry ) throws Exception
|
||||
{
|
||||
if ( !( entry instanceof DirectoryNode ) )
|
||||
return false;
|
||||
DirectoryNode directoryNode = (DirectoryNode) entry;
|
||||
|
||||
// even if no ExtractorFactory in classpath
|
||||
if ( directoryNode.hasEntry( "WordDocument" ) )
|
||||
{
|
||||
String text = WordToTextConverter.getText( (DirectoryNode) entry );
|
||||
block.appendChild( textDocumentFacade
|
||||
.createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
|
||||
+ UNICODECHAR_ZERO_WIDTH_SPACE ) );
|
||||
return true;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
Class<?> cls = Class
|
||||
.forName( "org.apache.poi.extractor.ExtractorFactory" );
|
||||
Method createExtractor = cls.getMethod( "createExtractor",
|
||||
DirectoryNode.class );
|
||||
Object extractor = createExtractor.invoke( null, directoryNode );
|
||||
|
||||
Method getText = extractor.getClass().getMethod( "getText" );
|
||||
String text = (String) getText.invoke( extractor );
|
||||
|
||||
block.appendChild( textDocumentFacade
|
||||
.createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
|
||||
+ UNICODECHAR_ZERO_WIDTH_SPACE ) );
|
||||
return true;
|
||||
}
|
||||
catch ( ClassNotFoundException exc )
|
||||
{
|
||||
// no extractor in classpath
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processPageref( HWPFDocumentCore wordDocument,
|
||||
Element currentBlock, Range textRange, int currentTableLevel,
|
||||
|
@ -254,7 +369,7 @@ public class WordToTextConverter extends AbstractWordConverter
|
|||
textDocumentFacade.body.appendChild( sectionElement );
|
||||
}
|
||||
|
||||
protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
|
||||
protected void processTable( HWPFDocumentCore wordDocument, Element flow,
|
||||
Table table )
|
||||
{
|
||||
final int tableRows = table.numRows();
|
||||
|
@ -275,8 +390,8 @@ public class WordToTextConverter extends AbstractWordConverter
|
|||
tableCellElement.appendChild( textDocumentFacade
|
||||
.createText( "\t" ) );
|
||||
|
||||
processParagraphes( hwpfDocument, tableCellElement, tableCell,
|
||||
table.getTableLevel() );
|
||||
processCharacters( wordDocument, table.getTableLevel(),
|
||||
tableCell, tableCellElement );
|
||||
tableRowElement.appendChild( tableCellElement );
|
||||
}
|
||||
|
||||
|
@ -285,4 +400,9 @@ public class WordToTextConverter extends AbstractWordConverter
|
|||
}
|
||||
}
|
||||
|
||||
public void setOutputSummaryInformation( boolean outputDocumentInformation )
|
||||
{
|
||||
this.outputSummaryInformation = outputDocumentInformation;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -19,6 +19,10 @@ package org.apache.poi.hwpf.extractor;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringWriter;
|
||||
|
||||
import org.apache.poi.hwpf.converter.WordToTextConverter;
|
||||
import org.apache.poi.hwpf.usermodel.HeaderStories;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||
|
@ -47,16 +51,32 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
|||
this( new POIFSFileSystem(is) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param fs POIFSFileSystem containing the word file
|
||||
*/
|
||||
public Word6Extractor(POIFSFileSystem fs) throws IOException {
|
||||
this(fs.getRoot(), fs);
|
||||
}
|
||||
public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
|
||||
this(new HWPFOldDocument(dir,fs));
|
||||
}
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
*
|
||||
* @param fs
|
||||
* POIFSFileSystem containing the word file
|
||||
*/
|
||||
public Word6Extractor( POIFSFileSystem fs ) throws IOException
|
||||
{
|
||||
this( fs.getRoot() );
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
@SuppressWarnings( "unused" )
|
||||
public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
|
||||
throws IOException
|
||||
{
|
||||
this( dir );
|
||||
}
|
||||
|
||||
public Word6Extractor( DirectoryNode dir ) throws IOException
|
||||
{
|
||||
this( new HWPFOldDocument( dir ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
|
@ -71,6 +91,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
|||
* Get the text from the word file, as an array with one String
|
||||
* per paragraph
|
||||
*/
|
||||
@Deprecated
|
||||
public String[] getParagraphText() {
|
||||
String[] ret;
|
||||
|
||||
|
@ -95,13 +116,25 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
|||
return ret;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
for(String t : getParagraphText()) {
|
||||
text.append(t);
|
||||
public String getText()
|
||||
{
|
||||
try
|
||||
{
|
||||
WordToTextConverter wordToTextConverter = new WordToTextConverter();
|
||||
wordToTextConverter.processDocument( doc );
|
||||
return wordToTextConverter.getText();
|
||||
}
|
||||
catch ( Exception exc )
|
||||
{
|
||||
// fall-back
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
return text.toString();
|
||||
for ( String t : getParagraphText() )
|
||||
{
|
||||
text.append( t );
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,9 +20,12 @@ package org.apache.poi.hwpf.extractor;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.poi.hwpf.converter.WordToTextConverter;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.usermodel.HeaderStories;
|
||||
|
@ -33,231 +36,300 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
|
||||
/**
|
||||
* Class to extract the text from a Word Document.
|
||||
*
|
||||
* You should use either getParagraphText() or getText() unless
|
||||
* you have a strong reason otherwise.
|
||||
*
|
||||
*
|
||||
* You should use either getParagraphText() or getText() unless you have a
|
||||
* strong reason otherwise.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public final class WordExtractor extends POIOLE2TextExtractor {
|
||||
private POIFSFileSystem fs;
|
||||
private HWPFDocument doc;
|
||||
public final class WordExtractor extends POIOLE2TextExtractor
|
||||
{
|
||||
private HWPFDocument doc;
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param is InputStream containing the word file
|
||||
*/
|
||||
public WordExtractor(InputStream is) throws IOException {
|
||||
this( HWPFDocument.verifyAndBuildPOIFS(is) );
|
||||
}
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
*
|
||||
* @param is
|
||||
* InputStream containing the word file
|
||||
*/
|
||||
public WordExtractor( InputStream is ) throws IOException
|
||||
{
|
||||
this( HWPFDocument.verifyAndBuildPOIFS( is ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param fs POIFSFileSystem containing the word file
|
||||
*/
|
||||
public WordExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(new HWPFDocument(fs));
|
||||
this.fs = fs;
|
||||
}
|
||||
public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
|
||||
this(new HWPFDocument(dir, fs));
|
||||
this.fs = fs;
|
||||
}
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
*
|
||||
* @param fs
|
||||
* POIFSFileSystem containing the word file
|
||||
*/
|
||||
public WordExtractor( POIFSFileSystem fs ) throws IOException
|
||||
{
|
||||
this( new HWPFDocument( fs ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param doc The HWPFDocument to extract from
|
||||
*/
|
||||
public WordExtractor(HWPFDocument doc) {
|
||||
super(doc);
|
||||
this.doc = doc;
|
||||
}
|
||||
/**
|
||||
* @deprecated Use {@link #WordExtractor(DirectoryNode)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public WordExtractor( DirectoryNode dir, POIFSFileSystem fs )
|
||||
throws IOException
|
||||
{
|
||||
this( dir );
|
||||
}
|
||||
|
||||
/**
|
||||
* Command line extractor, so people will stop moaning that
|
||||
* they can't just run this.
|
||||
*/
|
||||
public static void main(String[] args) throws IOException {
|
||||
if(args.length == 0) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
|
||||
System.exit(1);
|
||||
}
|
||||
public WordExtractor( DirectoryNode dir ) throws IOException
|
||||
{
|
||||
this( new HWPFDocument( dir ) );
|
||||
}
|
||||
|
||||
// Process the first argument as a file
|
||||
FileInputStream fin = new FileInputStream(args[0]);
|
||||
WordExtractor extractor = new WordExtractor(fin);
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
*
|
||||
* @param doc
|
||||
* The HWPFDocument to extract from
|
||||
*/
|
||||
public WordExtractor( HWPFDocument doc )
|
||||
{
|
||||
super( doc );
|
||||
this.doc = doc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the text from the word file, as an array with one String
|
||||
* per paragraph
|
||||
*/
|
||||
public String[] getParagraphText() {
|
||||
String[] ret;
|
||||
/**
|
||||
* Command line extractor, so people will stop moaning that they can't just
|
||||
* run this.
|
||||
*/
|
||||
public static void main( String[] args ) throws IOException
|
||||
{
|
||||
if ( args.length == 0 )
|
||||
{
|
||||
System.err.println( "Use:" );
|
||||
System.err
|
||||
.println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
|
||||
System.exit( 1 );
|
||||
}
|
||||
|
||||
// Extract using the model code
|
||||
try {
|
||||
Range r = doc.getRange();
|
||||
// Process the first argument as a file
|
||||
FileInputStream fin = new FileInputStream( args[0] );
|
||||
WordExtractor extractor = new WordExtractor( fin );
|
||||
System.out.println( extractor.getText() );
|
||||
}
|
||||
|
||||
ret = getParagraphText(r);
|
||||
} catch (Exception e) {
|
||||
// Something's up with turning the text pieces into paragraphs
|
||||
// Fall back to ripping out the text pieces
|
||||
ret = new String[1];
|
||||
ret[0] = getTextFromPieces();
|
||||
/**
|
||||
* Get the text from the word file, as an array with one String per
|
||||
* paragraph
|
||||
*/
|
||||
public String[] getParagraphText()
|
||||
{
|
||||
String[] ret;
|
||||
|
||||
// Extract using the model code
|
||||
try
|
||||
{
|
||||
Range r = doc.getRange();
|
||||
|
||||
ret = getParagraphText( r );
|
||||
}
|
||||
catch ( Exception e )
|
||||
{
|
||||
// Something's up with turning the text pieces into paragraphs
|
||||
// Fall back to ripping out the text pieces
|
||||
ret = new String[1];
|
||||
ret[0] = getTextFromPieces();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public String[] getFootnoteText()
|
||||
{
|
||||
Range r = doc.getFootnoteRange();
|
||||
|
||||
return getParagraphText( r );
|
||||
}
|
||||
|
||||
public String[] getMainTextboxText()
|
||||
{
|
||||
Range r = doc.getMainTextboxRange();
|
||||
|
||||
return getParagraphText( r );
|
||||
}
|
||||
|
||||
public String[] getEndnoteText()
|
||||
{
|
||||
Range r = doc.getEndnoteRange();
|
||||
|
||||
return getParagraphText( r );
|
||||
}
|
||||
|
||||
public String[] getCommentsText()
|
||||
{
|
||||
Range r = doc.getCommentsRange();
|
||||
|
||||
return getParagraphText( r );
|
||||
}
|
||||
|
||||
protected static String[] getParagraphText( Range r )
|
||||
{
|
||||
String[] ret;
|
||||
ret = new String[r.numParagraphs()];
|
||||
for ( int i = 0; i < ret.length; i++ )
|
||||
{
|
||||
Paragraph p = r.getParagraph( i );
|
||||
ret[i] = p.text();
|
||||
|
||||
// Fix the line ending
|
||||
if ( ret[i].endsWith( "\r" ) )
|
||||
{
|
||||
ret[i] = ret[i] + "\n";
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the header/footer text, if it's not empty
|
||||
*/
|
||||
private void appendHeaderFooter( String text, StringBuffer out )
|
||||
{
|
||||
if ( text == null || text.length() == 0 )
|
||||
return;
|
||||
|
||||
text = text.replace( '\r', '\n' );
|
||||
if ( !text.endsWith( "\n" ) )
|
||||
{
|
||||
out.append( text );
|
||||
out.append( '\n' );
|
||||
return;
|
||||
}
|
||||
if ( text.endsWith( "\n\n" ) )
|
||||
{
|
||||
out.append( text.substring( 0, text.length() - 1 ) );
|
||||
return;
|
||||
}
|
||||
out.append( text );
|
||||
return;
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the text from the headers
|
||||
*/
|
||||
@Deprecated
|
||||
public String getHeaderText()
|
||||
{
|
||||
HeaderStories hs = new HeaderStories( doc );
|
||||
|
||||
StringBuffer ret = new StringBuffer();
|
||||
if ( hs.getFirstHeader() != null )
|
||||
{
|
||||
appendHeaderFooter( hs.getFirstHeader(), ret );
|
||||
}
|
||||
if ( hs.getEvenHeader() != null )
|
||||
{
|
||||
appendHeaderFooter( hs.getEvenHeader(), ret );
|
||||
}
|
||||
if ( hs.getOddHeader() != null )
|
||||
{
|
||||
appendHeaderFooter( hs.getOddHeader(), ret );
|
||||
}
|
||||
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the text from the footers
|
||||
*/
|
||||
@Deprecated
|
||||
public String getFooterText()
|
||||
{
|
||||
HeaderStories hs = new HeaderStories( doc );
|
||||
|
||||
StringBuffer ret = new StringBuffer();
|
||||
if ( hs.getFirstFooter() != null )
|
||||
{
|
||||
appendHeaderFooter( hs.getFirstFooter(), ret );
|
||||
}
|
||||
if ( hs.getEvenFooter() != null )
|
||||
{
|
||||
appendHeaderFooter( hs.getEvenFooter(), ret );
|
||||
}
|
||||
if ( hs.getOddFooter() != null )
|
||||
{
|
||||
appendHeaderFooter( hs.getOddFooter(), ret );
|
||||
}
|
||||
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the text out of the text pieces. Might also include various bits of
|
||||
* crud, but will work in cases where the text piece -> paragraph mapping is
|
||||
* broken. Fast too.
|
||||
*/
|
||||
public String getTextFromPieces()
|
||||
{
|
||||
String text = doc.getDocumentText();
|
||||
|
||||
// Fix line endings (Note - won't get all of them
|
||||
text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
|
||||
text = text.replaceAll( "\r\r", "\r\n\r\n" );
|
||||
|
||||
if ( text.endsWith( "\r" ) )
|
||||
{
|
||||
text += "\n";
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the text, based on the WordToTextConverter. Shouldn't include any
|
||||
* crud, but slower than getTextFromPieces().
|
||||
*/
|
||||
public String getText()
|
||||
{
|
||||
try
|
||||
{
|
||||
final StringWriter stringWriter = new StringWriter();
|
||||
@SuppressWarnings( "unused" )
|
||||
WordToTextConverter wordToTextConverter = new WordToTextConverter()
|
||||
{
|
||||
{
|
||||
HeaderStories hs = new HeaderStories( doc );
|
||||
|
||||
if ( hs.getFirstHeaderSubrange() != null )
|
||||
processDocumentPart( doc, hs.getFirstHeaderSubrange() );
|
||||
if ( hs.getEvenHeaderSubrange() != null )
|
||||
processDocumentPart( doc, hs.getEvenHeaderSubrange() );
|
||||
if ( hs.getOddHeaderSubrange() != null )
|
||||
processDocumentPart( doc, hs.getOddHeaderSubrange() );
|
||||
|
||||
processDocument( doc );
|
||||
processDocumentPart( doc, doc.getMainTextboxRange() );
|
||||
|
||||
if ( hs.getFirstFooterSubrange() != null )
|
||||
processDocumentPart( doc, hs.getFirstFooterSubrange() );
|
||||
if ( hs.getEvenFooterSubrange() != null )
|
||||
processDocumentPart( doc, hs.getEvenFooterSubrange() );
|
||||
if ( hs.getOddFooterSubrange() != null )
|
||||
processDocumentPart( doc, hs.getOddFooterSubrange() );
|
||||
|
||||
stringWriter.append( getText() );
|
||||
}
|
||||
|
||||
return ret;
|
||||
};
|
||||
return stringWriter.toString();
|
||||
}
|
||||
|
||||
public String[] getFootnoteText() {
|
||||
Range r = doc.getFootnoteRange();
|
||||
|
||||
return getParagraphText(r);
|
||||
catch ( Exception exc )
|
||||
{
|
||||
throw new RuntimeException( exc );
|
||||
}
|
||||
}
|
||||
|
||||
public String[] getMainTextboxText() {
|
||||
Range r = doc.getMainTextboxRange();
|
||||
|
||||
return getParagraphText(r);
|
||||
}
|
||||
|
||||
public String[] getEndnoteText() {
|
||||
Range r = doc.getEndnoteRange();
|
||||
|
||||
return getParagraphText(r);
|
||||
}
|
||||
|
||||
public String[] getCommentsText() {
|
||||
Range r = doc.getCommentsRange();
|
||||
|
||||
return getParagraphText(r);
|
||||
}
|
||||
|
||||
protected static String[] getParagraphText(Range r) {
|
||||
String[] ret;
|
||||
ret = new String[r.numParagraphs()];
|
||||
for (int i = 0; i < ret.length; i++) {
|
||||
Paragraph p = r.getParagraph(i);
|
||||
ret[i] = p.text();
|
||||
|
||||
// Fix the line ending
|
||||
if (ret[i].endsWith("\r")) {
|
||||
ret[i] = ret[i] + "\n";
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the header/footer text, if it's not empty
|
||||
*/
|
||||
private void appendHeaderFooter(String text, StringBuffer out) {
|
||||
if(text == null || text.length() == 0)
|
||||
return;
|
||||
|
||||
text = text.replace('\r', '\n');
|
||||
if(! text.endsWith("\n")) {
|
||||
out.append(text);
|
||||
out.append('\n');
|
||||
return;
|
||||
}
|
||||
if(text.endsWith("\n\n")) {
|
||||
out.append(text.substring(0, text.length()-1));
|
||||
return;
|
||||
}
|
||||
out.append(text);
|
||||
return;
|
||||
}
|
||||
/**
|
||||
* Grab the text from the headers
|
||||
*/
|
||||
public String getHeaderText() {
|
||||
HeaderStories hs = new HeaderStories(doc);
|
||||
|
||||
StringBuffer ret = new StringBuffer();
|
||||
if(hs.getFirstHeader() != null) {
|
||||
appendHeaderFooter(hs.getFirstHeader(), ret);
|
||||
}
|
||||
if(hs.getEvenHeader() != null) {
|
||||
appendHeaderFooter(hs.getEvenHeader(), ret);
|
||||
}
|
||||
if(hs.getOddHeader() != null) {
|
||||
appendHeaderFooter(hs.getOddHeader(), ret);
|
||||
}
|
||||
|
||||
return ret.toString();
|
||||
}
|
||||
/**
|
||||
* Grab the text from the footers
|
||||
*/
|
||||
public String getFooterText() {
|
||||
HeaderStories hs = new HeaderStories(doc);
|
||||
|
||||
StringBuffer ret = new StringBuffer();
|
||||
if(hs.getFirstFooter() != null) {
|
||||
appendHeaderFooter(hs.getFirstFooter(), ret);
|
||||
}
|
||||
if(hs.getEvenFooter() != null) {
|
||||
appendHeaderFooter(hs.getEvenFooter(), ret);
|
||||
}
|
||||
if(hs.getOddFooter() != null) {
|
||||
appendHeaderFooter(hs.getOddFooter(), ret);
|
||||
}
|
||||
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the text out of the text pieces. Might also include various
|
||||
* bits of crud, but will work in cases where the text piece -> paragraph
|
||||
* mapping is broken. Fast too.
|
||||
*/
|
||||
public String getTextFromPieces() {
|
||||
String text = doc.getDocumentText();
|
||||
|
||||
// Fix line endings (Note - won't get all of them
|
||||
text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
|
||||
text = text.replaceAll("\r\r", "\r\n\r\n");
|
||||
|
||||
if(text.endsWith("\r")) {
|
||||
text += "\n";
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the text, based on the paragraphs. Shouldn't include any crud,
|
||||
* but slightly slower than getTextFromPieces().
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer ret = new StringBuffer();
|
||||
|
||||
ret.append(getHeaderText());
|
||||
|
||||
ArrayList<String> text = new ArrayList<String>();
|
||||
text.addAll(Arrays.asList(getParagraphText()));
|
||||
text.addAll(Arrays.asList(getMainTextboxText()));
|
||||
text.addAll(Arrays.asList(getFootnoteText()));
|
||||
text.addAll(Arrays.asList(getEndnoteText()));
|
||||
|
||||
for(String p : text) {
|
||||
ret.append(p);
|
||||
}
|
||||
|
||||
ret.append(getFooterText());
|
||||
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes any fields (eg macros, page markers etc)
|
||||
* from the string.
|
||||
*/
|
||||
public static String stripFields(String text) {
|
||||
return Range.stripFields(text);
|
||||
}
|
||||
/**
|
||||
* Removes any fields (eg macros, page markers etc) from the string.
|
||||
*/
|
||||
public static String stripFields( String text )
|
||||
{
|
||||
return Range.stripFields( text );
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,17 +17,23 @@ public interface Field
|
|||
*/
|
||||
int getFieldStartOffset();
|
||||
|
||||
CharacterRun getMarkEndCharacterRun( Range parent );
|
||||
|
||||
/**
|
||||
* @return character position of end field mark
|
||||
*/
|
||||
int getMarkEndOffset();
|
||||
|
||||
CharacterRun getMarkSeparatorCharacterRun( Range parent );
|
||||
|
||||
/**
|
||||
* @return character position of separator field mark (if present,
|
||||
* {@link NullPointerException} otherwise)
|
||||
*/
|
||||
int getMarkSeparatorOffset();
|
||||
|
||||
CharacterRun getMarkStartCharacterRun( Range parent );
|
||||
|
||||
/**
|
||||
* @return character position of start field mark
|
||||
*/
|
||||
|
|
|
@ -112,6 +112,12 @@ class FieldImpl implements Field
|
|||
return startPlex.getFcStart();
|
||||
}
|
||||
|
||||
public CharacterRun getMarkEndCharacterRun( Range parent )
|
||||
{
|
||||
return new Range( getMarkEndOffset(), getMarkEndOffset() + 1, parent )
|
||||
.getCharacterRun( 0 );
|
||||
}
|
||||
|
||||
/**
|
||||
* @return character position of end field mark
|
||||
*/
|
||||
|
@ -120,6 +126,15 @@ class FieldImpl implements Field
|
|||
return endPlex.getFcStart();
|
||||
}
|
||||
|
||||
public CharacterRun getMarkSeparatorCharacterRun( Range parent )
|
||||
{
|
||||
if ( !hasSeparator() )
|
||||
return null;
|
||||
|
||||
return new Range( getMarkSeparatorOffset(),
|
||||
getMarkSeparatorOffset() + 1, parent ).getCharacterRun( 0 );
|
||||
}
|
||||
|
||||
/**
|
||||
* @return character position of separator field mark (if present,
|
||||
* {@link NullPointerException} otherwise)
|
||||
|
@ -129,6 +144,12 @@ class FieldImpl implements Field
|
|||
return separatorPlex.getFcStart();
|
||||
}
|
||||
|
||||
public CharacterRun getMarkStartCharacterRun( Range parent )
|
||||
{
|
||||
return new Range( getMarkStartOffset(), getMarkStartOffset() + 1,
|
||||
parent ).getCharacterRun( 0 );
|
||||
}
|
||||
|
||||
/**
|
||||
* @return character position of start field mark
|
||||
*/
|
||||
|
|
|
@ -82,35 +82,96 @@ public final class HeaderStories {
|
|||
fib.getPlcfHddSize(), 0 );
|
||||
}
|
||||
|
||||
public String getFootnoteSeparator() {
|
||||
return getAt(0);
|
||||
}
|
||||
public String getFootnoteContSeparator() {
|
||||
return getAt(1);
|
||||
}
|
||||
public String getFootnoteContNote() {
|
||||
return getAt(2);
|
||||
}
|
||||
public String getEndnoteSeparator() {
|
||||
return getAt(3);
|
||||
}
|
||||
public String getEndnoteContSeparator() {
|
||||
return getAt(4);
|
||||
}
|
||||
public String getEndnoteContNote() {
|
||||
return getAt(5);
|
||||
}
|
||||
@Deprecated
|
||||
public String getFootnoteSeparator()
|
||||
{
|
||||
return getAt( 0 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getFootnoteContSeparator()
|
||||
{
|
||||
return getAt( 1 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getFootnoteContNote()
|
||||
{
|
||||
return getAt( 2 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getEndnoteSeparator()
|
||||
{
|
||||
return getAt( 3 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getEndnoteContSeparator()
|
||||
{
|
||||
return getAt( 4 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getEndnoteContNote()
|
||||
{
|
||||
return getAt( 5 );
|
||||
}
|
||||
|
||||
public Range getFootnoteSeparatorSubrange()
|
||||
{
|
||||
return getSubrangeAt( 0 );
|
||||
}
|
||||
|
||||
public Range getFootnoteContSeparatorSubrange()
|
||||
{
|
||||
return getSubrangeAt( 1 );
|
||||
}
|
||||
|
||||
public Range getFootnoteContNoteSubrange()
|
||||
{
|
||||
return getSubrangeAt( 2 );
|
||||
}
|
||||
|
||||
public Range getEndnoteSeparatorSubrange()
|
||||
{
|
||||
return getSubrangeAt( 3 );
|
||||
}
|
||||
|
||||
public Range getEndnoteContSeparatorSubrange()
|
||||
{
|
||||
return getSubrangeAt( 4 );
|
||||
}
|
||||
|
||||
public Range getEndnoteContNoteSubrange()
|
||||
{
|
||||
return getSubrangeAt( 5 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getEvenHeader() {
|
||||
return getAt(6+0);
|
||||
}
|
||||
@Deprecated
|
||||
public String getOddHeader() {
|
||||
return getAt(6+1);
|
||||
}
|
||||
@Deprecated
|
||||
public String getFirstHeader() {
|
||||
return getAt(6+4);
|
||||
}
|
||||
|
||||
|
||||
public Range getEvenHeaderSubrange() {
|
||||
return getSubrangeAt(6+0);
|
||||
}
|
||||
public Range getOddHeaderSubrange() {
|
||||
return getSubrangeAt(6+1);
|
||||
}
|
||||
public Range getFirstHeaderSubrange() {
|
||||
return getSubrangeAt(6+4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the correct, defined header for the given
|
||||
* one based page
|
||||
|
@ -135,16 +196,39 @@ public final class HeaderStories {
|
|||
return getOddHeader();
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getEvenFooter()
|
||||
{
|
||||
return getAt( 6 + 2 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getOddFooter()
|
||||
{
|
||||
return getAt( 6 + 3 );
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public String getFirstFooter()
|
||||
{
|
||||
return getAt( 6 + 5 );
|
||||
}
|
||||
|
||||
public Range getEvenFooterSubrange()
|
||||
{
|
||||
return getSubrangeAt( 6 + 2 );
|
||||
}
|
||||
|
||||
public Range getOddFooterSubrange()
|
||||
{
|
||||
return getSubrangeAt( 6 + 3 );
|
||||
}
|
||||
|
||||
public Range getFirstFooterSubrange()
|
||||
{
|
||||
return getSubrangeAt( 6 + 5 );
|
||||
}
|
||||
|
||||
public String getEvenFooter() {
|
||||
return getAt(6+2);
|
||||
}
|
||||
public String getOddFooter() {
|
||||
return getAt(6+3);
|
||||
}
|
||||
public String getFirstFooter() {
|
||||
return getAt(6+5);
|
||||
}
|
||||
/**
|
||||
* Returns the correct, defined footer for the given
|
||||
* one based page
|
||||
|
@ -174,6 +258,7 @@ public final class HeaderStories {
|
|||
* Get the string that's pointed to by the
|
||||
* given plcfHdd index
|
||||
*/
|
||||
@Deprecated
|
||||
private String getAt(int plcfHddIndex) {
|
||||
if(plcfHdd == null) return null;
|
||||
|
||||
|
@ -209,6 +294,32 @@ public final class HeaderStories {
|
|||
return text;
|
||||
}
|
||||
|
||||
private Range getSubrangeAt( int plcfHddIndex )
|
||||
{
|
||||
if ( plcfHdd == null )
|
||||
return null;
|
||||
|
||||
GenericPropertyNode prop = plcfHdd.getProperty( plcfHddIndex );
|
||||
if ( prop.getStart() == prop.getEnd() )
|
||||
{
|
||||
// Empty story
|
||||
return null;
|
||||
}
|
||||
if ( prop.getEnd() < prop.getStart() )
|
||||
{
|
||||
// Broken properties?
|
||||
return null;
|
||||
}
|
||||
|
||||
final int headersLength = headerStories.getEndOffset()
|
||||
- headerStories.getStartOffset();
|
||||
int start = Math.min( prop.getStart(), headersLength );
|
||||
int end = Math.min( prop.getEnd(), headersLength );
|
||||
|
||||
return new Range( headerStories.getStartOffset() + start,
|
||||
headerStories.getStartOffset() + end, headerStories );
|
||||
}
|
||||
|
||||
public Range getRange() {
|
||||
return headerStories;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.util.Internal;
|
||||
|
||||
@Internal
|
||||
public class ObjectPoolImpl implements ObjectsPool
|
||||
{
|
||||
private DirectoryEntry _objectPool;
|
||||
|
||||
public ObjectPoolImpl( DirectoryEntry _objectPool )
|
||||
{
|
||||
super();
|
||||
this._objectPool = _objectPool;
|
||||
}
|
||||
|
||||
public Entry getObjectById( String objId )
|
||||
{
|
||||
if ( _objectPool == null )
|
||||
return null;
|
||||
|
||||
try
|
||||
{
|
||||
return _objectPool.getEntry( objId );
|
||||
}
|
||||
catch ( FileNotFoundException exc )
|
||||
{
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
|
||||
public interface ObjectsPool
|
||||
{
|
||||
public Entry getObjectById( String objId );
|
||||
}
|
|
@ -36,6 +36,8 @@ import org.apache.poi.hwpf.sprm.CharacterSprmCompressor;
|
|||
import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
|
||||
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* This class is the central class of the HWPF object model. All properties that
|
||||
|
@ -52,6 +54,8 @@ import org.apache.poi.util.LittleEndian;
|
|||
*/
|
||||
public class Range { // TODO -instantiable superclass
|
||||
|
||||
private POILogger logger = POILogFactory.getLogger( Range.class );
|
||||
|
||||
public static final int TYPE_PARAGRAPH = 0;
|
||||
public static final int TYPE_CHARACTER = 1;
|
||||
public static final int TYPE_SECTION = 2;
|
||||
|
@ -888,9 +892,12 @@ public class Range { // TODO -instantiable superclass
|
|||
initAll();
|
||||
if ( tableEndInclusive >= this._parEnd )
|
||||
{
|
||||
throw new ArrayIndexOutOfBoundsException(
|
||||
"The table's bounds fall outside of this Range" );
|
||||
logger.log( POILogger.WARN, "The table's bounds ", "["
|
||||
+ this._parStart + "; " + tableEndInclusive + ")",
|
||||
" fall outside of this Range paragraphs numbers ", "["
|
||||
+ this._parStart + "; " + this._parEnd + ")" );
|
||||
}
|
||||
|
||||
if ( tableEndInclusive < 0 )
|
||||
{
|
||||
throw new ArrayIndexOutOfBoundsException(
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
package org.apache.poi.hwpf.converter;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||
|
||||
public class TestWordToTextConverter extends TestCase
|
||||
{
|
||||
|
||||
/**
|
||||
* [FAILING] Bug 47731 - Word Extractor considers text copied from some
|
||||
* website as an embedded object
|
||||
*/
|
||||
public void testBug47731() throws Exception
|
||||
{
|
||||
HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
|
||||
String foundText = WordToTextConverter.getText( doc );
|
||||
|
||||
assertTrue( foundText
|
||||
.contains( "Soak the rice in water for three to four hours" ) );
|
||||
}
|
||||
}
|
|
@ -33,6 +33,16 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
* @author Nick Burch (nick at torchbox dot com)
|
||||
*/
|
||||
public final class TestWordExtractor extends TestCase {
|
||||
|
||||
public static void assertEquals( String expected, String actual )
|
||||
{
|
||||
String newExpected = expected.replaceAll( "\r\n", "\n" )
|
||||
.replaceAll( "\r", "\n" ).trim();
|
||||
String newActual = actual.replaceAll( "\r\n", "\n" )
|
||||
.replaceAll( "\r", "\n" ).trim();
|
||||
TestCase.assertEquals( newExpected, newActual );
|
||||
}
|
||||
|
||||
private String[] p_text1 = new String[] {
|
||||
"This is a simple word document\r\n",
|
||||
"\r\n",
|
||||
|
@ -107,12 +117,14 @@ public final class TestWordExtractor extends TestCase {
|
|||
public void testGetText() {
|
||||
assertEquals(p_text1_block, extractor.getText());
|
||||
|
||||
// For the 2nd, should give similar answers for
|
||||
// the two methods, differing only in line endings
|
||||
assertEquals(
|
||||
extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
|
||||
extractor2.getText().replaceAll("[\\r\\n]", ""));
|
||||
}
|
||||
// For the 2nd, should give similar answers for
|
||||
// the two methods, differing only in line endings
|
||||
|
||||
// nope, they must have different results, because of garbage
|
||||
// assertEquals(
|
||||
// extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
|
||||
// extractor2.getText().replaceAll("[\\r\\n]", ""));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test textPieces based extraction
|
||||
|
@ -330,7 +342,7 @@ public final class TestWordExtractor extends TestCase {
|
|||
|
||||
// Open directly
|
||||
for(DirectoryNode dir : files) {
|
||||
WordExtractor extractor = new WordExtractor(dir, null);
|
||||
WordExtractor extractor = new WordExtractor(dir);
|
||||
assertEquals(p_text1_block, extractor.getText());
|
||||
}
|
||||
|
||||
|
|
|
@ -43,6 +43,15 @@ import org.apache.poi.util.IOUtils;
|
|||
public class TestBugs extends TestCase
|
||||
{
|
||||
|
||||
public static void assertEquals( String expected, String actual )
|
||||
{
|
||||
String newExpected = expected.replaceAll( "\r\n", "\n" )
|
||||
.replaceAll( "\r", "\n" ).trim();
|
||||
String newActual = actual.replaceAll( "\r\n", "\n" )
|
||||
.replaceAll( "\r", "\n" ).trim();
|
||||
TestCase.assertEquals( newExpected, newActual );
|
||||
}
|
||||
|
||||
private static void assertTableStructures( Range expected, Range actual )
|
||||
{
|
||||
assertEquals( expected.numParagraphs(), actual.numParagraphs() );
|
||||
|
|
Loading…
Reference in New Issue