mirror of https://github.com/apache/poi.git
More Word 6 / Word 95 Support
HWPFOldDocument now processes a few more table sections, so we can fake up some basic Ranges. This allows us to do paragraph-level text extraction. git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@960102 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c1d139babd
commit
2d9df14178
|
@ -34,6 +34,7 @@
|
||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.7-beta2" date="2010-??-??">
|
<release version="3.7-beta2" date="2010-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">Paragraph level as well as whole-file text extraction for Word 6/95 files through HWPF</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
|
<action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
|
<action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
|
<action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>
|
||||||
|
|
|
@ -31,7 +31,6 @@ import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||||
import org.apache.poi.hwpf.model.DocumentProperties;
|
import org.apache.poi.hwpf.model.DocumentProperties;
|
||||||
import org.apache.poi.hwpf.model.EscherRecordHolder;
|
import org.apache.poi.hwpf.model.EscherRecordHolder;
|
||||||
import org.apache.poi.hwpf.model.FSPATable;
|
import org.apache.poi.hwpf.model.FSPATable;
|
||||||
import org.apache.poi.hwpf.model.FileInformationBlock;
|
|
||||||
import org.apache.poi.hwpf.model.FontTable;
|
import org.apache.poi.hwpf.model.FontTable;
|
||||||
import org.apache.poi.hwpf.model.GenericPropertyNode;
|
import org.apache.poi.hwpf.model.GenericPropertyNode;
|
||||||
import org.apache.poi.hwpf.model.ListTables;
|
import org.apache.poi.hwpf.model.ListTables;
|
||||||
|
@ -83,24 +82,6 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||||
|
|
||||||
protected TextPieceTable _tpt;
|
protected TextPieceTable _tpt;
|
||||||
|
|
||||||
/** Contains formatting properties for text*/
|
|
||||||
protected CHPBinTable _cbt;
|
|
||||||
|
|
||||||
/** Contains formatting properties for paragraphs*/
|
|
||||||
protected PAPBinTable _pbt;
|
|
||||||
|
|
||||||
/** Contains formatting properties for sections.*/
|
|
||||||
protected SectionTable _st;
|
|
||||||
|
|
||||||
/** Holds styles for this document.*/
|
|
||||||
protected StyleSheet _ss;
|
|
||||||
|
|
||||||
/** Holds fonts for this document.*/
|
|
||||||
protected FontTable _ft;
|
|
||||||
|
|
||||||
/** Hold list tables */
|
|
||||||
protected ListTables _lt;
|
|
||||||
|
|
||||||
/** Holds the save history for this document. */
|
/** Holds the save history for this document. */
|
||||||
protected SavedByTable _sbt;
|
protected SavedByTable _sbt;
|
||||||
|
|
||||||
|
@ -277,15 +258,11 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public StyleSheet getStyleSheet()
|
public TextPieceTable getTextTable()
|
||||||
{
|
{
|
||||||
return _ss;
|
return _cft.getTextPieceTable();
|
||||||
}
|
}
|
||||||
|
|
||||||
public FileInformationBlock getFileInformationBlock()
|
|
||||||
{
|
|
||||||
return _fib;
|
|
||||||
}
|
|
||||||
public CPSplitCalculator getCPSplitCalculator()
|
public CPSplitCalculator getCPSplitCalculator()
|
||||||
{
|
{
|
||||||
return _cpSplit;
|
return _cpSplit;
|
||||||
|
@ -390,11 +367,6 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ListTables getListTables()
|
|
||||||
{
|
|
||||||
return _lt;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets a reference to the saved -by table, which holds the save history for the document.
|
* Gets a reference to the saved -by table, which holds the save history for the document.
|
||||||
*
|
*
|
||||||
|
@ -591,26 +563,6 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||||
pfs.writeFilesystem(out);
|
pfs.writeFilesystem(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
public CHPBinTable getCharacterTable()
|
|
||||||
{
|
|
||||||
return _cbt;
|
|
||||||
}
|
|
||||||
|
|
||||||
public PAPBinTable getParagraphTable()
|
|
||||||
{
|
|
||||||
return _pbt;
|
|
||||||
}
|
|
||||||
|
|
||||||
public SectionTable getSectionTable()
|
|
||||||
{
|
|
||||||
return _st;
|
|
||||||
}
|
|
||||||
|
|
||||||
public TextPieceTable getTextTable()
|
|
||||||
{
|
|
||||||
return _cft.getTextPieceTable();
|
|
||||||
}
|
|
||||||
|
|
||||||
public byte[] getDataStream()
|
public byte[] getDataStream()
|
||||||
{
|
{
|
||||||
return _dataStream;
|
return _dataStream;
|
||||||
|
@ -629,11 +581,6 @@ public final class HWPFDocument extends HWPFDocumentCore
|
||||||
return _lt.addList(list.getListData(), list.getOverride());
|
return _lt.addList(list.getListData(), list.getOverride());
|
||||||
}
|
}
|
||||||
|
|
||||||
public FontTable getFontTable()
|
|
||||||
{
|
|
||||||
return _ft;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void delete(int start, int length)
|
public void delete(int start, int length)
|
||||||
{
|
{
|
||||||
Range r = new Range(start, start + length, this);
|
Range r = new Range(start, start + length, this);
|
||||||
|
|
|
@ -23,7 +23,15 @@ import java.io.PushbackInputStream;
|
||||||
|
|
||||||
import org.apache.poi.EncryptedDocumentException;
|
import org.apache.poi.EncryptedDocumentException;
|
||||||
import org.apache.poi.POIDocument;
|
import org.apache.poi.POIDocument;
|
||||||
|
import org.apache.poi.hwpf.model.CHPBinTable;
|
||||||
import org.apache.poi.hwpf.model.FileInformationBlock;
|
import org.apache.poi.hwpf.model.FileInformationBlock;
|
||||||
|
import org.apache.poi.hwpf.model.FontTable;
|
||||||
|
import org.apache.poi.hwpf.model.ListTables;
|
||||||
|
import org.apache.poi.hwpf.model.PAPBinTable;
|
||||||
|
import org.apache.poi.hwpf.model.SectionTable;
|
||||||
|
import org.apache.poi.hwpf.model.StyleSheet;
|
||||||
|
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
@ -40,6 +48,24 @@ public abstract class HWPFDocumentCore extends POIDocument
|
||||||
/** The FIB */
|
/** The FIB */
|
||||||
protected FileInformationBlock _fib;
|
protected FileInformationBlock _fib;
|
||||||
|
|
||||||
|
/** Holds styles for this document.*/
|
||||||
|
protected StyleSheet _ss;
|
||||||
|
|
||||||
|
/** Contains formatting properties for text*/
|
||||||
|
protected CHPBinTable _cbt;
|
||||||
|
|
||||||
|
/** Contains formatting properties for paragraphs*/
|
||||||
|
protected PAPBinTable _pbt;
|
||||||
|
|
||||||
|
/** Contains formatting properties for sections.*/
|
||||||
|
protected SectionTable _st;
|
||||||
|
|
||||||
|
/** Holds fonts for this document.*/
|
||||||
|
protected FontTable _ft;
|
||||||
|
|
||||||
|
/** Hold list tables */
|
||||||
|
protected ListTables _lt;
|
||||||
|
|
||||||
/** main document stream buffer*/
|
/** main document stream buffer*/
|
||||||
protected byte[] _mainStream;
|
protected byte[] _mainStream;
|
||||||
|
|
||||||
|
@ -123,6 +149,44 @@ public abstract class HWPFDocumentCore extends POIDocument
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the range which covers the whole of the
|
||||||
|
* document, but excludes any headers and footers.
|
||||||
|
*/
|
||||||
|
public abstract Range getRange();
|
||||||
|
|
||||||
|
public abstract TextPieceTable getTextTable();
|
||||||
|
|
||||||
|
public CHPBinTable getCharacterTable()
|
||||||
|
{
|
||||||
|
return _cbt;
|
||||||
|
}
|
||||||
|
|
||||||
|
public PAPBinTable getParagraphTable()
|
||||||
|
{
|
||||||
|
return _pbt;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SectionTable getSectionTable()
|
||||||
|
{
|
||||||
|
return _st;
|
||||||
|
}
|
||||||
|
|
||||||
|
public StyleSheet getStyleSheet()
|
||||||
|
{
|
||||||
|
return _ss;
|
||||||
|
}
|
||||||
|
|
||||||
|
public ListTables getListTables()
|
||||||
|
{
|
||||||
|
return _lt;
|
||||||
|
}
|
||||||
|
|
||||||
|
public FontTable getFontTable()
|
||||||
|
{
|
||||||
|
return _ft;
|
||||||
|
}
|
||||||
|
|
||||||
public FileInformationBlock getFileInformationBlock()
|
public FileInformationBlock getFileInformationBlock()
|
||||||
{
|
{
|
||||||
return _fib;
|
return _fib;
|
||||||
|
|
|
@ -18,15 +18,15 @@ package org.apache.poi.hwpf;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.poi.hwpf.model.CHPX;
|
|
||||||
import org.apache.poi.hwpf.model.ComplexFileTable;
|
import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||||
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
||||||
|
import org.apache.poi.hwpf.model.OldPAPBinTable;
|
||||||
|
import org.apache.poi.hwpf.model.OldSectionTable;
|
||||||
import org.apache.poi.hwpf.model.PieceDescriptor;
|
import org.apache.poi.hwpf.model.PieceDescriptor;
|
||||||
import org.apache.poi.hwpf.model.TextPiece;
|
import org.apache.poi.hwpf.model.TextPiece;
|
||||||
import org.apache.poi.hwpf.model.TextPieceTable;
|
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
@ -34,11 +34,9 @@ import org.apache.poi.util.LittleEndian;
|
||||||
/**
|
/**
|
||||||
* Provides very simple support for old (Word 6 / Word 95)
|
* Provides very simple support for old (Word 6 / Word 95)
|
||||||
* files.
|
* files.
|
||||||
* TODO Provide a way to get at the properties associated
|
|
||||||
* with each block of text
|
|
||||||
*/
|
*/
|
||||||
public class HWPFOldDocument extends HWPFDocumentCore {
|
public class HWPFOldDocument extends HWPFDocumentCore {
|
||||||
private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
|
private TextPieceTable tpt;
|
||||||
|
|
||||||
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
||||||
this(fs.getRoot(), fs);
|
this(fs.getRoot(), fs);
|
||||||
|
@ -49,14 +47,19 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||||
super(directory, fs);
|
super(directory, fs);
|
||||||
|
|
||||||
// Where are things?
|
// Where are things?
|
||||||
|
int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
|
||||||
|
int sedTableSize = LittleEndian.getInt(_mainStream, 0x8c);
|
||||||
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
|
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
|
||||||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
||||||
|
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
|
||||||
|
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
|
||||||
|
//int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
|
||||||
|
//int shfTableSize = LittleEndian.getInt(_mainStream, 0x64);
|
||||||
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
||||||
|
|
||||||
// We need to get hold of the text that makes up the
|
// We need to get hold of the text that makes up the
|
||||||
// document, which might be regular or fast-saved
|
// document, which might be regular or fast-saved
|
||||||
StringBuffer text = new StringBuffer();
|
StringBuffer text = new StringBuffer();
|
||||||
TextPieceTable tpt;
|
|
||||||
if(_fib.isFComplex()) {
|
if(_fib.isFComplex()) {
|
||||||
ComplexFileTable cft = new ComplexFileTable(
|
ComplexFileTable cft = new ComplexFileTable(
|
||||||
_mainStream, _mainStream,
|
_mainStream, _mainStream,
|
||||||
|
@ -68,11 +71,15 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||||
text.append( tp.getStringBuffer() );
|
text.append( tp.getStringBuffer() );
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// TODO Discover if these older documents can ever hold Unicode Strings?
|
||||||
|
// (We think not, because they seem to lack a Piece table)
|
||||||
// TODO Build the Piece Descriptor properly
|
// TODO Build the Piece Descriptor properly
|
||||||
// TODO Can these old documents ever contain Unicode strings?
|
// (We have to fake it, as they don't seem to have a proper Piece table)
|
||||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
|
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
|
||||||
pd.setFilePosition(_fib.getFcMin());
|
pd.setFilePosition(_fib.getFcMin());
|
||||||
|
|
||||||
|
// Generate a single Text Piece Table, with a single Text Piece
|
||||||
|
// which covers all the (8 bit only) text in the file
|
||||||
tpt = new TextPieceTable();
|
tpt = new TextPieceTable();
|
||||||
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
|
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
|
||||||
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
|
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
|
||||||
|
@ -85,51 +92,34 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now we can fetch the character and paragraph properties
|
// Now we can fetch the character and paragraph properties
|
||||||
OldCHPBinTable chpTable = new OldCHPBinTable(
|
_cbt = new OldCHPBinTable(
|
||||||
_mainStream, chpTableOffset, chpTableSize,
|
_mainStream, chpTableOffset, chpTableSize,
|
||||||
_fib.getFcMin(), tpt
|
_fib.getFcMin(), tpt
|
||||||
);
|
);
|
||||||
|
_pbt = new OldPAPBinTable(
|
||||||
|
_mainStream, papTableOffset, papTableSize,
|
||||||
|
_fib.getFcMin(), tpt
|
||||||
|
);
|
||||||
|
_st = new OldSectionTable(
|
||||||
|
_mainStream, sedTableOffset, sedTableSize,
|
||||||
|
_fib.getFcMin(), tpt
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Finally build up runs
|
public Range getRange() {
|
||||||
for(CHPX chpx : chpTable.getTextRuns()) {
|
// Life is easy when we have no footers, headers or unicode!
|
||||||
String str = text.substring(chpx.getStart(), chpx.getEnd());
|
return new Range(
|
||||||
contents.add(new TextAndCHPX(str,chpx));
|
0, _fib.getFcMac() - _fib.getFcMin(), this
|
||||||
}
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public TextPieceTable getTextTable()
|
||||||
|
{
|
||||||
|
return tpt;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void write(OutputStream out) throws IOException {
|
public void write(OutputStream out) throws IOException {
|
||||||
throw new IllegalStateException("Writing is not available for the older file formats");
|
throw new IllegalStateException("Writing is not available for the older file formats");
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieves all our text, in order, along with the
|
|
||||||
* CHPX information on each bit.
|
|
||||||
* Every entry has the same formatting, but as yet
|
|
||||||
* we've no way to tell what the formatting is...
|
|
||||||
* Warnings - this will change as soon as we support
|
|
||||||
* text formatting!
|
|
||||||
*/
|
|
||||||
public List<TextAndCHPX> getContents() {
|
|
||||||
return contents;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Warnings - this will change as soon as we support
|
|
||||||
* text formatting!
|
|
||||||
*/
|
|
||||||
public static class TextAndCHPX {
|
|
||||||
private String text;
|
|
||||||
private CHPX chpx;
|
|
||||||
private TextAndCHPX(String text, CHPX chpx) {
|
|
||||||
this.text = text;
|
|
||||||
this.chpx = chpx;
|
|
||||||
}
|
|
||||||
public String getText() {
|
|
||||||
return text;
|
|
||||||
}
|
|
||||||
public CHPX getChpx() {
|
|
||||||
return chpx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,7 +22,6 @@ import java.io.InputStream;
|
||||||
|
|
||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.hwpf.HWPFOldDocument;
|
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||||
import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
|
|
||||||
import org.apache.poi.hwpf.usermodel.Range;
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
@ -68,12 +67,41 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
||||||
this.doc = doc;
|
this.doc = doc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
/**
|
||||||
|
* Get the text from the word file, as an array with one String
|
||||||
|
* per paragraph
|
||||||
|
*/
|
||||||
|
public String[] getParagraphText() {
|
||||||
|
String[] ret;
|
||||||
|
|
||||||
|
// Extract using the model code
|
||||||
|
try {
|
||||||
|
Range r = doc.getRange();
|
||||||
|
|
||||||
|
ret = WordExtractor.getParagraphText(r);
|
||||||
|
} catch (Exception e) {
|
||||||
|
// Something's up with turning the text pieces into paragraphs
|
||||||
|
// Fall back to ripping out the text pieces
|
||||||
|
ret = new String[doc.getTextTable().getTextPieces().size()];
|
||||||
|
for(int i=0; i<ret.length; i++) {
|
||||||
|
ret[i] = doc.getTextTable().getTextPieces().get(i).getStringBuffer().toString();
|
||||||
|
|
||||||
|
// Fix the line endings
|
||||||
|
ret[i].replaceAll("\r", "\ufffe");
|
||||||
|
ret[i].replaceAll("\ufffe","\r\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
public String getText() {
|
public String getText() {
|
||||||
StringBuffer text = new StringBuffer();
|
StringBuffer text = new StringBuffer();
|
||||||
for(TextAndCHPX tchpx : doc.getContents()) {
|
|
||||||
text.append( Range.stripFields(tchpx.getText()) );
|
for(String t : getParagraphText()) {
|
||||||
|
text.append(t);
|
||||||
}
|
}
|
||||||
|
|
||||||
return text.toString();
|
return text.toString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,13 +17,12 @@
|
||||||
|
|
||||||
package org.apache.poi.hwpf.extractor;
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
@ -133,7 +132,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
||||||
return getParagraphText(r);
|
return getParagraphText(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String[] getParagraphText(Range r) {
|
protected static String[] getParagraphText(Range r) {
|
||||||
String[] ret;
|
String[] ret;
|
||||||
ret = new String[r.numParagraphs()];
|
ret = new String[r.numParagraphs()];
|
||||||
for (int i = 0; i < ret.length; i++) {
|
for (int i = 0; i < ret.length; i++) {
|
||||||
|
@ -215,10 +214,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
||||||
public String getTextFromPieces() {
|
public String getTextFromPieces() {
|
||||||
StringBuffer textBuf = new StringBuffer();
|
StringBuffer textBuf = new StringBuffer();
|
||||||
|
|
||||||
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
|
for(TextPiece piece : doc.getTextTable().getTextPieces()) {
|
||||||
while (textPieces.hasNext()) {
|
|
||||||
TextPiece piece = (TextPiece) textPieces.next();
|
|
||||||
|
|
||||||
String encoding = "Cp1252";
|
String encoding = "Cp1252";
|
||||||
if (piece.isUnicode()) {
|
if (piece.isUnicode()) {
|
||||||
encoding = "UTF-16LE";
|
encoding = "UTF-16LE";
|
||||||
|
|
|
@ -32,7 +32,7 @@ import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||||
*
|
*
|
||||||
* @author Ryan Ackley
|
* @author Ryan Ackley
|
||||||
*/
|
*/
|
||||||
public final class CHPBinTable
|
public class CHPBinTable
|
||||||
{
|
{
|
||||||
/** List of character properties.*/
|
/** List of character properties.*/
|
||||||
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
|
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
|
||||||
|
|
|
@ -17,9 +17,6 @@
|
||||||
|
|
||||||
package org.apache.poi.hwpf.model;
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.poi.poifs.common.POIFSConstants;
|
import org.apache.poi.poifs.common.POIFSConstants;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
|
||||||
|
@ -31,11 +28,8 @@ import org.apache.poi.util.LittleEndian;
|
||||||
* In common with the rest of the old support, it
|
* In common with the rest of the old support, it
|
||||||
* is read only
|
* is read only
|
||||||
*/
|
*/
|
||||||
public final class OldCHPBinTable
|
public final class OldCHPBinTable extends CHPBinTable
|
||||||
{
|
{
|
||||||
/** List of character properties.*/
|
|
||||||
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor used to read an old-style binTable
|
* Constructor used to read an old-style binTable
|
||||||
* in from a Word document.
|
* in from a Word document.
|
||||||
|
@ -69,9 +63,4 @@ public final class OldCHPBinTable
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<CHPX> getTextRuns()
|
|
||||||
{
|
|
||||||
return _textRuns;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
import org.apache.poi.poifs.common.POIFSConstants;
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class holds all of the paragraph formatting
|
||||||
|
* properties from Old (Word 6 / Word 95) documents.
|
||||||
|
* Unlike with Word 97+, it all gets held in the
|
||||||
|
* same stream.
|
||||||
|
* In common with the rest of the old support, it
|
||||||
|
* is read only
|
||||||
|
*/
|
||||||
|
public final class OldPAPBinTable extends PAPBinTable
|
||||||
|
{
|
||||||
|
public OldPAPBinTable(byte[] documentStream, int offset,
|
||||||
|
int size, int fcMin, TextPieceTable tpt)
|
||||||
|
{
|
||||||
|
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
|
||||||
|
|
||||||
|
int length = binTable.length();
|
||||||
|
for (int x = 0; x < length; x++)
|
||||||
|
{
|
||||||
|
GenericPropertyNode node = binTable.getProperty(x);
|
||||||
|
|
||||||
|
int pageNum = LittleEndian.getShort(node.getBytes());
|
||||||
|
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
|
||||||
|
|
||||||
|
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
|
||||||
|
documentStream, pageOffset, fcMin, tpt);
|
||||||
|
|
||||||
|
int fkpSize = pfkp.size();
|
||||||
|
|
||||||
|
for (int y = 0; y < fkpSize; y++)
|
||||||
|
{
|
||||||
|
PAPX papx = pfkp.getPAPX(y);
|
||||||
|
_paragraphs.add(papx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class holds all of the section formatting
|
||||||
|
* properties from Old (Word 6 / Word 95) documents.
|
||||||
|
* Unlike with Word 97+, it all gets held in the
|
||||||
|
* same stream.
|
||||||
|
* In common with the rest of the old support, it
|
||||||
|
* is read only
|
||||||
|
*/
|
||||||
|
public final class OldSectionTable extends SectionTable
|
||||||
|
{
|
||||||
|
public OldSectionTable(byte[] documentStream, int offset,
|
||||||
|
int size, int fcMin,
|
||||||
|
TextPieceTable tpt)
|
||||||
|
{
|
||||||
|
PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
|
||||||
|
|
||||||
|
int length = sedPlex.length();
|
||||||
|
|
||||||
|
for (int x = 0; x < length; x++)
|
||||||
|
{
|
||||||
|
GenericPropertyNode node = sedPlex.getProperty(x);
|
||||||
|
SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);
|
||||||
|
|
||||||
|
int fileOffset = sed.getFc();
|
||||||
|
int startAt = node.getStart();
|
||||||
|
int endAt = node.getEnd();
|
||||||
|
|
||||||
|
// check for the optimization
|
||||||
|
if (fileOffset == 0xffffffff)
|
||||||
|
{
|
||||||
|
_sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// The first short at the offset is the size of the grpprl.
|
||||||
|
int sepxSize = LittleEndian.getShort(documentStream, fileOffset);
|
||||||
|
byte[] buf = new byte[sepxSize];
|
||||||
|
fileOffset += LittleEndian.SHORT_SIZE;
|
||||||
|
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
|
||||||
|
_sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -34,7 +34,7 @@ import org.apache.poi.util.LittleEndian;
|
||||||
*
|
*
|
||||||
* @author Ryan Ackley
|
* @author Ryan Ackley
|
||||||
*/
|
*/
|
||||||
public final class PAPBinTable
|
public class PAPBinTable
|
||||||
{
|
{
|
||||||
protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
|
protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
|
||||||
byte[] _dataStream;
|
byte[] _dataStream;
|
||||||
|
|
|
@ -112,7 +112,11 @@ public final class PAPX extends BytePropertyNode {
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return LittleEndian.getShort(buf);
|
if (buf.length == 1)
|
||||||
|
{
|
||||||
|
return (short)LittleEndian.getUnsignedByte(buf, 0);
|
||||||
|
}
|
||||||
|
return LittleEndian.getShort(buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SprmBuffer getSprmBuf()
|
public SprmBuffer getSprmBuf()
|
||||||
|
@ -122,6 +126,11 @@ public final class PAPX extends BytePropertyNode {
|
||||||
|
|
||||||
public ParagraphProperties getParagraphProperties(StyleSheet ss)
|
public ParagraphProperties getParagraphProperties(StyleSheet ss)
|
||||||
{
|
{
|
||||||
|
if(ss == null) {
|
||||||
|
// TODO Fix up for Word 6/95
|
||||||
|
return new ParagraphProperties();
|
||||||
|
}
|
||||||
|
|
||||||
short istd = getIstd();
|
short istd = getIstd();
|
||||||
ParagraphProperties baseStyle = ss.getParagraphStyle(istd);
|
ParagraphProperties baseStyle = ss.getParagraphStyle(istd);
|
||||||
ParagraphProperties props = ParagraphSprmUncompressor.uncompressPAP(baseStyle, getGrpprl(), 2);
|
ParagraphProperties props = ParagraphSprmUncompressor.uncompressPAP(baseStyle, getGrpprl(), 2);
|
||||||
|
|
|
@ -27,12 +27,12 @@ import org.apache.poi.hwpf.model.io.*;
|
||||||
/**
|
/**
|
||||||
* @author Ryan Ackley
|
* @author Ryan Ackley
|
||||||
*/
|
*/
|
||||||
public final class SectionTable
|
public class SectionTable
|
||||||
{
|
{
|
||||||
private static final int SED_SIZE = 12;
|
private static final int SED_SIZE = 12;
|
||||||
|
|
||||||
protected ArrayList _sections = new ArrayList();
|
protected ArrayList<SEPX> _sections = new ArrayList<SEPX>();
|
||||||
protected List _text;
|
protected List<TextPiece> _text;
|
||||||
|
|
||||||
/** So we can know if things are unicode or not */
|
/** So we can know if things are unicode or not */
|
||||||
private TextPieceTable tpt;
|
private TextPieceTable tpt;
|
||||||
|
@ -84,7 +84,7 @@ public final class SectionTable
|
||||||
boolean matchAt = false;
|
boolean matchAt = false;
|
||||||
boolean matchHalf = false;
|
boolean matchHalf = false;
|
||||||
for(int i=0; i<_sections.size(); i++) {
|
for(int i=0; i<_sections.size(); i++) {
|
||||||
SEPX s = (SEPX)_sections.get(i);
|
SEPX s = _sections.get(i);
|
||||||
if(s.getEnd() == mainEndsAt) {
|
if(s.getEnd() == mainEndsAt) {
|
||||||
matchAt = true;
|
matchAt = true;
|
||||||
} else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) {
|
} else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) {
|
||||||
|
@ -94,7 +94,7 @@ public final class SectionTable
|
||||||
if(! matchAt && matchHalf) {
|
if(! matchAt && matchHalf) {
|
||||||
System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
|
System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
|
||||||
for(int i=0; i<_sections.size(); i++) {
|
for(int i=0; i<_sections.size(); i++) {
|
||||||
SEPX s = (SEPX)_sections.get(i);
|
SEPX s = _sections.get(i);
|
||||||
GenericPropertyNode node = sedPlex.getProperty(i);
|
GenericPropertyNode node = sedPlex.getProperty(i);
|
||||||
|
|
||||||
s.setStart( CPtoFC(node.getStart()) );
|
s.setStart( CPtoFC(node.getStart()) );
|
||||||
|
@ -106,12 +106,12 @@ public final class SectionTable
|
||||||
public void adjustForInsert(int listIndex, int length)
|
public void adjustForInsert(int listIndex, int length)
|
||||||
{
|
{
|
||||||
int size = _sections.size();
|
int size = _sections.size();
|
||||||
SEPX sepx = (SEPX)_sections.get(listIndex);
|
SEPX sepx = _sections.get(listIndex);
|
||||||
sepx.setEnd(sepx.getEnd() + length);
|
sepx.setEnd(sepx.getEnd() + length);
|
||||||
|
|
||||||
for (int x = listIndex + 1; x < size; x++)
|
for (int x = listIndex + 1; x < size; x++)
|
||||||
{
|
{
|
||||||
sepx = (SEPX)_sections.get(x);
|
sepx = _sections.get(x);
|
||||||
sepx.setStart(sepx.getStart() + length);
|
sepx.setStart(sepx.getStart() + length);
|
||||||
sepx.setEnd(sepx.getEnd() + length);
|
sepx.setEnd(sepx.getEnd() + length);
|
||||||
}
|
}
|
||||||
|
@ -129,7 +129,7 @@ public final class SectionTable
|
||||||
|
|
||||||
for(int i=_text.size()-1; i>-1; i--)
|
for(int i=_text.size()-1; i>-1; i--)
|
||||||
{
|
{
|
||||||
TP = (TextPiece)_text.get(i);
|
TP = _text.get(i);
|
||||||
|
|
||||||
if(CP >= TP.getCP()) break;
|
if(CP >= TP.getCP()) break;
|
||||||
}
|
}
|
||||||
|
@ -142,7 +142,7 @@ public final class SectionTable
|
||||||
return FC;
|
return FC;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ArrayList getSections()
|
public ArrayList<SEPX> getSections()
|
||||||
{
|
{
|
||||||
return _sections;
|
return _sections;
|
||||||
}
|
}
|
||||||
|
@ -159,7 +159,7 @@ public final class SectionTable
|
||||||
|
|
||||||
for (int x = 0; x < len; x++)
|
for (int x = 0; x < len; x++)
|
||||||
{
|
{
|
||||||
SEPX sepx = (SEPX)_sections.get(x);
|
SEPX sepx = _sections.get(x);
|
||||||
byte[] grpprl = sepx.getGrpprl();
|
byte[] grpprl = sepx.getGrpprl();
|
||||||
|
|
||||||
// write the sepx to the document stream. starts with a 2 byte size
|
// write the sepx to the document stream. starts with a 2 byte size
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.poi.hwpf.usermodel;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
import org.apache.poi.hwpf.HWPFDocumentCore;
|
||||||
|
|
||||||
import org.apache.poi.hwpf.usermodel.CharacterRun;
|
import org.apache.poi.hwpf.usermodel.CharacterRun;
|
||||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||||
|
@ -77,7 +78,7 @@ public class Range { // TODO -instantiable superclass
|
||||||
protected int _end;
|
protected int _end;
|
||||||
|
|
||||||
/** The document this range belongs to. */
|
/** The document this range belongs to. */
|
||||||
protected HWPFDocument _doc;
|
protected HWPFDocumentCore _doc;
|
||||||
|
|
||||||
/** Have we loaded the section indexes yet */
|
/** Have we loaded the section indexes yet */
|
||||||
boolean _sectionRangeFound;
|
boolean _sectionRangeFound;
|
||||||
|
@ -144,7 +145,7 @@ public class Range { // TODO -instantiable superclass
|
||||||
* @param doc
|
* @param doc
|
||||||
* The HWPFDocument the range is based on.
|
* The HWPFDocument the range is based on.
|
||||||
*/
|
*/
|
||||||
public Range(int start, int end, HWPFDocument doc) {
|
public Range(int start, int end, HWPFDocumentCore doc) {
|
||||||
_start = start;
|
_start = start;
|
||||||
_end = end;
|
_end = end;
|
||||||
_doc = doc;
|
_doc = doc;
|
||||||
|
@ -1004,6 +1005,8 @@ public class Range { // TODO -instantiable superclass
|
||||||
* The (signed) value that should be added to the FIB CCP fields
|
* The (signed) value that should be added to the FIB CCP fields
|
||||||
*/
|
*/
|
||||||
protected void adjustFIB(int adjustment) {
|
protected void adjustFIB(int adjustment) {
|
||||||
|
assert (_doc instanceof HWPFDocument);
|
||||||
|
|
||||||
// update the FIB.CCPText field (this should happen once per adjustment,
|
// update the FIB.CCPText field (this should happen once per adjustment,
|
||||||
// so we don't want it in
|
// so we don't want it in
|
||||||
// adjustForInsert() or it would get updated multiple times if the range
|
// adjustForInsert() or it would get updated multiple times if the range
|
||||||
|
@ -1011,7 +1014,7 @@ public class Range { // TODO -instantiable superclass
|
||||||
// without this, OpenOffice.org (v. 2.2.x) does not see all the text in
|
// without this, OpenOffice.org (v. 2.2.x) does not see all the text in
|
||||||
// the document
|
// the document
|
||||||
|
|
||||||
CPSplitCalculator cpS = _doc.getCPSplitCalculator();
|
CPSplitCalculator cpS = ((HWPFDocument)_doc).getCPSplitCalculator();
|
||||||
FileInformationBlock fib = _doc.getFileInformationBlock();
|
FileInformationBlock fib = _doc.getFileInformationBlock();
|
||||||
|
|
||||||
// Do for each affected part
|
// Do for each affected part
|
||||||
|
@ -1066,7 +1069,7 @@ public class Range { // TODO -instantiable superclass
|
||||||
return _end;
|
return _end;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected HWPFDocument getDocument() {
|
protected HWPFDocumentCore getDocument() {
|
||||||
|
|
||||||
return _doc;
|
return _doc;
|
||||||
}
|
}
|
||||||
|
|
|
@ -256,6 +256,16 @@ public final class TestWordExtractor extends TestCase {
|
||||||
assertTrue(text.contains("Paragraph 2"));
|
assertTrue(text.contains("Paragraph 2"));
|
||||||
assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
|
assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
|
||||||
assertTrue(text.contains("Last (4th) paragraph"));
|
assertTrue(text.contains("Last (4th) paragraph"));
|
||||||
|
|
||||||
|
String[] tp = w6e.getParagraphText();
|
||||||
|
assertEquals(7, tp.length);
|
||||||
|
assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
|
||||||
|
assertEquals("\r\n", tp[1]);
|
||||||
|
assertEquals("Paragraph 2\r\n", tp[2]);
|
||||||
|
assertEquals("\r\n", tp[3]);
|
||||||
|
assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]);
|
||||||
|
assertEquals("\r\n", tp[5]);
|
||||||
|
assertEquals("Last (4th) paragraph.\r\n", tp[6]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testWord6() throws Exception {
|
public void testWord6() throws Exception {
|
||||||
|
@ -273,5 +283,9 @@ public final class TestWordExtractor extends TestCase {
|
||||||
String text = w6e.getText();
|
String text = w6e.getText();
|
||||||
|
|
||||||
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
|
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
|
||||||
|
|
||||||
|
String[] tp = w6e.getParagraphText();
|
||||||
|
assertEquals(1, tp.length);
|
||||||
|
assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue