More Word 6 / Word 95 Support

HWPFOldDocument now processes a few more table sections, and so we can fake up some
 basic Ranges. This allows us to do paragraph level text extraction


git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@960102 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-07-02 20:59:30 +00:00
parent c1d139babd
commit 2d9df14178
15 changed files with 308 additions and 143 deletions

View File

@ -34,6 +34,7 @@
<changes>
<release version="3.7-beta2" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="add">Paragraph level as well as whole-file text extraction for Word 6/95 files through HWPF</action>
<action dev="POI-DEVELOPERS" type="add">Text Extraction support for older Word 6 and Word 95 files via HWPF</action>
<action dev="POI-DEVELOPERS" type="add">49508 - Allow the addition of paragraphs to XWPF Table Cells</action>
<action dev="POI-DEVELOPERS" type="fix">49446 - Don't consider 17.16.23 field codes as properly part of the paragraph's text</action>

View File

@ -31,7 +31,6 @@ import org.apache.poi.hwpf.model.ComplexFileTable;
import org.apache.poi.hwpf.model.DocumentProperties;
import org.apache.poi.hwpf.model.EscherRecordHolder;
import org.apache.poi.hwpf.model.FSPATable;
import org.apache.poi.hwpf.model.FileInformationBlock;
import org.apache.poi.hwpf.model.FontTable;
import org.apache.poi.hwpf.model.GenericPropertyNode;
import org.apache.poi.hwpf.model.ListTables;
@ -83,24 +82,6 @@ public final class HWPFDocument extends HWPFDocumentCore
protected TextPieceTable _tpt;
/** Contains formatting properties for text*/
protected CHPBinTable _cbt;
/** Contains formatting properties for paragraphs*/
protected PAPBinTable _pbt;
/** Contains formatting properties for sections.*/
protected SectionTable _st;
/** Holds styles for this document.*/
protected StyleSheet _ss;
/** Holds fonts for this document.*/
protected FontTable _ft;
/** Hold list tables */
protected ListTables _lt;
/** Holds the save history for this document. */
protected SavedByTable _sbt;
@ -277,15 +258,11 @@ public final class HWPFDocument extends HWPFDocumentCore
}
}
public StyleSheet getStyleSheet()
public TextPieceTable getTextTable()
{
return _ss;
return _cft.getTextPieceTable();
}
/**
 * Gives access to the File Information Block (FIB) of this document.
 *
 * @return the value of the {@code _fib} field
 */
public FileInformationBlock getFileInformationBlock()
{
return _fib;
}
public CPSplitCalculator getCPSplitCalculator()
{
return _cpSplit;
@ -390,11 +367,6 @@ public final class HWPFDocument extends HWPFDocumentCore
return length;
}
/**
 * Gives access to the list tables of this document.
 *
 * @return the value of the {@code _lt} field
 */
public ListTables getListTables()
{
return _lt;
}
/**
* Gets a reference to the saved-by table, which holds the save history for the document.
*
@ -591,26 +563,6 @@ public final class HWPFDocument extends HWPFDocumentCore
pfs.writeFilesystem(out);
}
/**
 * Gives access to the table holding the formatting properties for text.
 *
 * @return the value of the {@code _cbt} field
 */
public CHPBinTable getCharacterTable()
{
return _cbt;
}
/**
 * Gives access to the table holding the formatting properties for paragraphs.
 *
 * @return the value of the {@code _pbt} field
 */
public PAPBinTable getParagraphTable()
{
return _pbt;
}
/**
 * Gives access to the table holding the formatting properties for sections.
 *
 * @return the value of the {@code _st} field
 */
public SectionTable getSectionTable()
{
return _st;
}
/**
 * Gives access to the text piece table of this document.
 *
 * @return the text piece table obtained from the complex file table ({@code _cft})
 */
public TextPieceTable getTextTable()
{
return _cft.getTextPieceTable();
}
public byte[] getDataStream()
{
return _dataStream;
@ -629,11 +581,6 @@ public final class HWPFDocument extends HWPFDocumentCore
return _lt.addList(list.getListData(), list.getOverride());
}
/**
 * Gives access to the fonts of this document.
 *
 * @return the value of the {@code _ft} field
 */
public FontTable getFontTable()
{
return _ft;
}
public void delete(int start, int length)
{
Range r = new Range(start, start + length, this);

View File

@ -23,7 +23,15 @@ import java.io.PushbackInputStream;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIDocument;
import org.apache.poi.hwpf.model.CHPBinTable;
import org.apache.poi.hwpf.model.FileInformationBlock;
import org.apache.poi.hwpf.model.FontTable;
import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.model.PAPBinTable;
import org.apache.poi.hwpf.model.SectionTable;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -40,6 +48,24 @@ public abstract class HWPFDocumentCore extends POIDocument
/** The FIB */
protected FileInformationBlock _fib;
/** Holds styles for this document.*/
protected StyleSheet _ss;
/** Contains formatting properties for text*/
protected CHPBinTable _cbt;
/** Contains formatting properties for paragraphs*/
protected PAPBinTable _pbt;
/** Contains formatting properties for sections.*/
protected SectionTable _st;
/** Holds fonts for this document.*/
protected FontTable _ft;
/** Hold list tables */
protected ListTables _lt;
/** main document stream buffer*/
protected byte[] _mainStream;
@ -123,6 +149,44 @@ public abstract class HWPFDocumentCore extends POIDocument
}
}
/**
* Returns the range which covers the whole of the
* document, but excludes any headers and footers.
*/
public abstract Range getRange();
public abstract TextPieceTable getTextTable();
/**
 * Gives access to the table holding the formatting properties for text.
 *
 * @return the value of the {@code _cbt} field
 */
public CHPBinTable getCharacterTable()
{
return _cbt;
}
/**
 * Gives access to the table holding the formatting properties for paragraphs.
 *
 * @return the value of the {@code _pbt} field
 */
public PAPBinTable getParagraphTable()
{
return _pbt;
}
/**
 * Gives access to the table holding the formatting properties for sections.
 *
 * @return the value of the {@code _st} field
 */
public SectionTable getSectionTable()
{
return _st;
}
/**
 * Gives access to the styles of this document.
 *
 * @return the value of the {@code _ss} field
 */
public StyleSheet getStyleSheet()
{
return _ss;
}
/**
 * Gives access to the list tables of this document.
 *
 * @return the value of the {@code _lt} field
 */
public ListTables getListTables()
{
return _lt;
}
/**
 * Gives access to the fonts of this document.
 *
 * @return the value of the {@code _ft} field
 */
public FontTable getFontTable()
{
return _ft;
}
public FileInformationBlock getFileInformationBlock()
{
return _fib;

View File

@ -18,15 +18,15 @@ package org.apache.poi.hwpf;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.ComplexFileTable;
import org.apache.poi.hwpf.model.OldCHPBinTable;
import org.apache.poi.hwpf.model.OldPAPBinTable;
import org.apache.poi.hwpf.model.OldSectionTable;
import org.apache.poi.hwpf.model.PieceDescriptor;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
@ -34,11 +34,9 @@ import org.apache.poi.util.LittleEndian;
/**
* Provides very simple support for old (Word 6 / Word 95)
* files.
* TODO Provide a way to get at the properties associated
* with each block of text
*/
public class HWPFOldDocument extends HWPFDocumentCore {
private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
private TextPieceTable tpt;
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
this(fs.getRoot(), fs);
@ -49,14 +47,19 @@ public class HWPFOldDocument extends HWPFDocumentCore {
super(directory, fs);
// Where are things?
int sedTableOffset = LittleEndian.getInt(_mainStream, 0x88);
int sedTableSize = LittleEndian.getInt(_mainStream, 0x8c);
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
//int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
//int shfTableSize = LittleEndian.getInt(_mainStream, 0x64);
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
// We need to get hold of the text that makes up the
// document, which might be regular or fast-saved
StringBuffer text = new StringBuffer();
TextPieceTable tpt;
if(_fib.isFComplex()) {
ComplexFileTable cft = new ComplexFileTable(
_mainStream, _mainStream,
@ -68,11 +71,15 @@ public class HWPFOldDocument extends HWPFDocumentCore {
text.append( tp.getStringBuffer() );
}
} else {
// TODO Discover if these older documents can ever hold Unicode Strings?
// (We think not, because they seem to lack a Piece table)
// TODO Build the Piece Descriptor properly
// TODO Can these old documents ever contain Unicode strings?
// (We have to fake it, as they don't seem to have a proper Piece table)
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
pd.setFilePosition(_fib.getFcMin());
// Generate a single Text Piece Table, with a single Text Piece
// which covers all the (8 bit only) text in the file
tpt = new TextPieceTable();
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
@ -85,51 +92,34 @@ public class HWPFOldDocument extends HWPFDocumentCore {
}
// Now we can fetch the character and paragraph properties
OldCHPBinTable chpTable = new OldCHPBinTable(
_cbt = new OldCHPBinTable(
_mainStream, chpTableOffset, chpTableSize,
_fib.getFcMin(), tpt
);
_pbt = new OldPAPBinTable(
_mainStream, papTableOffset, papTableSize,
_fib.getFcMin(), tpt
);
_st = new OldSectionTable(
_mainStream, sedTableOffset, sedTableSize,
_fib.getFcMin(), tpt
);
}
// Finally build up runs
for(CHPX chpx : chpTable.getTextRuns()) {
String str = text.substring(chpx.getStart(), chpx.getEnd());
contents.add(new TextAndCHPX(str,chpx));
}
/**
 * Builds a range over this document that starts at 0 and is as long
 * as the distance between the FIB's fcMin and fcMac.
 *
 * @return a new Range backed by this document
 */
public Range getRange() {
    // Life is easy when we have no footers, headers or unicode!
    int textLength = _fib.getFcMac() - _fib.getFcMin();
    return new Range(0, textLength, this);
}
/**
 * Gives access to the text piece table held by this document.
 *
 * @return the value of the {@code tpt} field
 */
public TextPieceTable getTextTable()
{
return tpt;
}
/**
 * Writing is not supported for the older file formats, so this
 * method never writes anything to the stream.
 *
 * @param out the stream that would have been written to (unused)
 * @throws IllegalStateException always
 */
@Override
public void write(OutputStream out) throws IOException {
throw new IllegalStateException("Writing is not available for the older file formats");
}
/**
* Retrieves all our text, in order, along with the
* CHPX information on each bit.
* Every entry has the same formatting, but as yet
* we've no way to tell what the formatting is...
* Warnings - this will change as soon as we support
* text formatting!
*/
public List<TextAndCHPX> getContents() {
return contents;
}
/**
* Warnings - this will change as soon as we support
* text formatting!
*/
public static class TextAndCHPX {
private String text;
private CHPX chpx;
private TextAndCHPX(String text, CHPX chpx) {
this.text = text;
this.chpx = chpx;
}
public String getText() {
return text;
}
public CHPX getChpx() {
return chpx;
}
}
}

View File

@ -22,7 +22,6 @@ import java.io.InputStream;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -68,12 +67,41 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
this.doc = doc;
}
@Override
/**
 * Get the text from the word file, as an array with one String
 * per paragraph.
 * <p>
 * The paragraphs are normally built from the document's Range. If that
 * fails for any reason, the raw text pieces are returned instead (one
 * array entry per text piece), with every carriage return expanded to
 * a carriage return + line feed pair.
 *
 * @return the paragraph strings, or the text piece strings on fallback
 */
public String[] getParagraphText() {
    String[] ret;

    // Extract using the model code
    try {
        Range r = doc.getRange();
        ret = WordExtractor.getParagraphText(r);
    } catch (Exception e) {
        // Something's up with turning the text pieces into paragraphs
        // Fall back to ripping out the text pieces
        ret = new String[doc.getTextTable().getTextPieces().size()];
        for (int i = 0; i < ret.length; i++) {
            String piece = doc.getTextTable().getTextPieces().get(i).getStringBuffer().toString();

            // Fix the line endings. Strings are immutable, so the converted
            // text has to be stored back; previously the result was dropped.
            ret[i] = piece.replace("\r", "\r\n");
        }
    }

    return ret;
}
public String getText() {
StringBuffer text = new StringBuffer();
for(TextAndCHPX tchpx : doc.getContents()) {
text.append( Range.stripFields(tchpx.getText()) );
for(String t : getParagraphText()) {
text.append(t);
}
return text.toString();
}
}

View File

@ -17,13 +17,12 @@
package org.apache.poi.hwpf.extractor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
@ -133,7 +132,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
return getParagraphText(r);
}
private String[] getParagraphText(Range r) {
protected static String[] getParagraphText(Range r) {
String[] ret;
ret = new String[r.numParagraphs()];
for (int i = 0; i < ret.length; i++) {
@ -215,10 +214,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
public String getTextFromPieces() {
StringBuffer textBuf = new StringBuffer();
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
while (textPieces.hasNext()) {
TextPiece piece = (TextPiece) textPieces.next();
for(TextPiece piece : doc.getTextTable().getTextPieces()) {
String encoding = "Cp1252";
if (piece.isUnicode()) {
encoding = "UTF-16LE";

View File

@ -32,7 +32,7 @@ import org.apache.poi.hwpf.sprm.SprmBuffer;
*
* @author Ryan Ackley
*/
public final class CHPBinTable
public class CHPBinTable
{
/** List of character properties.*/
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();

View File

@ -17,9 +17,6 @@
package org.apache.poi.hwpf.model;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.LittleEndian;
@ -31,11 +28,8 @@ import org.apache.poi.util.LittleEndian;
* In common with the rest of the old support, it
* is read only
*/
public final class OldCHPBinTable
public final class OldCHPBinTable extends CHPBinTable
{
/** List of character properties.*/
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
/**
* Constructor used to read an old-style binTable
* in from a Word document.
@ -69,9 +63,4 @@ public final class OldCHPBinTable
}
}
}
public List<CHPX> getTextRuns()
{
return _textRuns;
}
}

View File

@ -0,0 +1,59 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.LittleEndian;
/**
 * This class holds all of the paragraph formatting
 * properties from Old (Word 6 / Word 95) documents.
 * Unlike with Word 97+, it all gets held in the
 * same stream.
 * In common with the rest of the old support, it
 * is read only
 */
public final class OldPAPBinTable extends PAPBinTable
{
    /**
     * Reads the old-style paragraph bin table out of the document stream
     * and collects every PAPX from each formatted disk page it points to.
     *
     * @param documentStream the stream holding both the bin table and the pages
     * @param offset where the bin table starts in the stream
     * @param size the size of the bin table
     * @param fcMin passed through to each PAPFormattedDiskPage
     * @param tpt the text piece table, passed through to each PAPFormattedDiskPage
     */
    public OldPAPBinTable(byte[] documentStream, int offset,
                          int size, int fcMin, TextPieceTable tpt)
    {
        PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);

        int length = binTable.length();
        for (int x = 0; x < length; x++)
        {
            GenericPropertyNode node = binTable.getProperty(x);

            // The page number is stored in two bytes. A page number can never
            // be negative, so mask off the sign extension; otherwise values of
            // 0x8000 and above would turn into a negative page offset.
            int pageNum = LittleEndian.getShort(node.getBytes()) & 0xFFFF;
            int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;

            PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(documentStream,
                    documentStream, pageOffset, fcMin, tpt);

            int fkpSize = pfkp.size();
            for (int y = 0; y < fkpSize; y++)
            {
                PAPX papx = pfkp.getPAPX(y);
                _paragraphs.add(papx);
            }
        }
    }
}

View File

@ -0,0 +1,65 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import org.apache.poi.util.LittleEndian;
/**
 * Section formatting properties for Old (Word 6 / Word 95)
 * documents. In these files everything lives in the one
 * document stream, unlike Word 97+.
 * Like the rest of the old format support, this is read only.
 */
public final class OldSectionTable extends SectionTable
{
    /**
     * Reads the section descriptors out of the document stream and
     * records one SEPX per descriptor.
     *
     * @param documentStream the stream holding the descriptors and their grpprls
     * @param offset where the section descriptor plex starts
     * @param size the size of the section descriptor plex
     * @param fcMin not used by this constructor
     * @param tpt the text piece table handed to every SEPX
     */
    public OldSectionTable(byte[] documentStream, int offset,
                           int size, int fcMin,
                           TextPieceTable tpt)
    {
        PlexOfCps plex = new PlexOfCps(documentStream, offset, size, 12);

        final int count = plex.length();
        for (int idx = 0; idx < count; idx++)
        {
            GenericPropertyNode property = plex.getProperty(idx);
            SectionDescriptor descriptor = new SectionDescriptor(property.getBytes(), 0);

            int grpprlAt = descriptor.getFc();
            int rangeStart = property.getStart();
            int rangeEnd = property.getEnd();

            byte[] grpprl;
            if (grpprlAt != 0xffffffff)
            {
                // The first short at the offset is the size of the grpprl,
                // and the grpprl itself follows straight after it.
                int grpprlSize = LittleEndian.getShort(documentStream, grpprlAt);
                grpprl = new byte[grpprlSize];
                grpprlAt += LittleEndian.SHORT_SIZE;
                System.arraycopy(documentStream, grpprlAt, grpprl, 0, grpprl.length);
            }
            else
            {
                // The optimization case: nothing is read from the stream,
                // the section just gets an empty buffer.
                grpprl = new byte[0];
            }

            _sections.add(new SEPX(descriptor, rangeStart, rangeEnd, tpt, grpprl));
        }
    }
}

View File

@ -34,7 +34,7 @@ import org.apache.poi.util.LittleEndian;
*
* @author Ryan Ackley
*/
public final class PAPBinTable
public class PAPBinTable
{
protected ArrayList<PAPX> _paragraphs = new ArrayList<PAPX>();
byte[] _dataStream;

View File

@ -112,7 +112,11 @@ public final class PAPX extends BytePropertyNode {
{
return 0;
}
return LittleEndian.getShort(buf);
if (buf.length == 1)
{
return (short)LittleEndian.getUnsignedByte(buf, 0);
}
return LittleEndian.getShort(buf);
}
public SprmBuffer getSprmBuf()
@ -122,6 +126,11 @@ public final class PAPX extends BytePropertyNode {
public ParagraphProperties getParagraphProperties(StyleSheet ss)
{
if(ss == null) {
// TODO Fix up for Word 6/95
return new ParagraphProperties();
}
short istd = getIstd();
ParagraphProperties baseStyle = ss.getParagraphStyle(istd);
ParagraphProperties props = ParagraphSprmUncompressor.uncompressPAP(baseStyle, getGrpprl(), 2);

View File

@ -27,12 +27,12 @@ import org.apache.poi.hwpf.model.io.*;
/**
* @author Ryan Ackley
*/
public final class SectionTable
public class SectionTable
{
private static final int SED_SIZE = 12;
protected ArrayList _sections = new ArrayList();
protected List _text;
protected ArrayList<SEPX> _sections = new ArrayList<SEPX>();
protected List<TextPiece> _text;
/** So we can know if things are unicode or not */
private TextPieceTable tpt;
@ -84,7 +84,7 @@ public final class SectionTable
boolean matchAt = false;
boolean matchHalf = false;
for(int i=0; i<_sections.size(); i++) {
SEPX s = (SEPX)_sections.get(i);
SEPX s = _sections.get(i);
if(s.getEnd() == mainEndsAt) {
matchAt = true;
} else if(s.getEndBytes() == mainEndsAt || s.getEndBytes() == mainEndsAt-1) {
@ -94,7 +94,7 @@ public final class SectionTable
if(! matchAt && matchHalf) {
System.err.println("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
for(int i=0; i<_sections.size(); i++) {
SEPX s = (SEPX)_sections.get(i);
SEPX s = _sections.get(i);
GenericPropertyNode node = sedPlex.getProperty(i);
s.setStart( CPtoFC(node.getStart()) );
@ -106,12 +106,12 @@ public final class SectionTable
public void adjustForInsert(int listIndex, int length)
{
int size = _sections.size();
SEPX sepx = (SEPX)_sections.get(listIndex);
SEPX sepx = _sections.get(listIndex);
sepx.setEnd(sepx.getEnd() + length);
for (int x = listIndex + 1; x < size; x++)
{
sepx = (SEPX)_sections.get(x);
sepx = _sections.get(x);
sepx.setStart(sepx.getStart() + length);
sepx.setEnd(sepx.getEnd() + length);
}
@ -129,7 +129,7 @@ public final class SectionTable
for(int i=_text.size()-1; i>-1; i--)
{
TP = (TextPiece)_text.get(i);
TP = _text.get(i);
if(CP >= TP.getCP()) break;
}
@ -142,7 +142,7 @@ public final class SectionTable
return FC;
}
public ArrayList getSections()
public ArrayList<SEPX> getSections()
{
return _sections;
}
@ -159,7 +159,7 @@ public final class SectionTable
for (int x = 0; x < len; x++)
{
SEPX sepx = (SEPX)_sections.get(x);
SEPX sepx = _sections.get(x);
byte[] grpprl = sepx.getGrpprl();
// write the sepx to the document stream. starts with a 2 byte size

View File

@ -20,6 +20,7 @@ package org.apache.poi.hwpf.usermodel;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
@ -77,7 +78,7 @@ public class Range { // TODO -instantiable superclass
protected int _end;
/** The document this range belongs to. */
protected HWPFDocument _doc;
protected HWPFDocumentCore _doc;
/** Have we loaded the section indexes yet */
boolean _sectionRangeFound;
@ -144,7 +145,7 @@ public class Range { // TODO -instantiable superclass
* @param doc
* The HWPFDocument the range is based on.
*/
public Range(int start, int end, HWPFDocument doc) {
public Range(int start, int end, HWPFDocumentCore doc) {
_start = start;
_end = end;
_doc = doc;
@ -1004,6 +1005,8 @@ public class Range { // TODO -instantiable superclass
* The (signed) value that should be added to the FIB CCP fields
*/
protected void adjustFIB(int adjustment) {
assert (_doc instanceof HWPFDocument);
// update the FIB.CCPText field (this should happen once per adjustment,
// so we don't want it in
// adjustForInsert() or it would get updated multiple times if the range
@ -1011,7 +1014,7 @@ public class Range { // TODO -instantiable superclass
// without this, OpenOffice.org (v. 2.2.x) does not see all the text in
// the document
CPSplitCalculator cpS = _doc.getCPSplitCalculator();
CPSplitCalculator cpS = ((HWPFDocument)_doc).getCPSplitCalculator();
FileInformationBlock fib = _doc.getFileInformationBlock();
// Do for each affected part
@ -1066,7 +1069,7 @@ public class Range { // TODO -instantiable superclass
return _end;
}
protected HWPFDocument getDocument() {
protected HWPFDocumentCore getDocument() {
return _doc;
}

View File

@ -256,6 +256,16 @@ public final class TestWordExtractor extends TestCase {
assertTrue(text.contains("Paragraph 2"));
assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
assertTrue(text.contains("Last (4th) paragraph"));
String[] tp = w6e.getParagraphText();
assertEquals(7, tp.length);
assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
assertEquals("\r\n", tp[1]);
assertEquals("Paragraph 2\r\n", tp[2]);
assertEquals("\r\n", tp[3]);
assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]);
assertEquals("\r\n", tp[5]);
assertEquals("Last (4th) paragraph.\r\n", tp[6]);
}
public void testWord6() throws Exception {
@ -273,5 +283,9 @@ public final class TestWordExtractor extends TestCase {
String text = w6e.getText();
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
String[] tp = w6e.getParagraphText();
assertEquals(1, tp.length);
assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
}
}