Fix support for sections in old Word 6 / Word 95 files

Improve unit testing for HWPFOldDocument
The Sprm fix also improves some HWPFDocument files!


git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@998131 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-09-17 13:46:11 +00:00
parent 44b21ca848
commit 8cd8659010
10 changed files with 233 additions and 21 deletions

View File

@ -34,6 +34,7 @@
<changes> <changes>
<release version="3.7-beta3" date="2010-??-??"> <release version="3.7-beta3" date="2010-??-??">
<action dev="poi-developers" type="fix">49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument)</action>
<action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action> <action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action>
<action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action> <action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action>
<action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be accessed from where they live in the text stream</action> <action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be accessed from where they live in the text stream</action>

View File

@ -245,22 +245,22 @@ public final class WordExtractor extends POIOLE2TextExtractor {
* but slightly slower than getTextFromPieces(). * but slightly slower than getTextFromPieces().
*/ */
public String getText() { public String getText() {
StringBuffer ret = new StringBuffer(); StringBuffer ret = new StringBuffer();
ret.append(getHeaderText()); ret.append(getHeaderText());
ArrayList<String> text = new ArrayList<String>(); ArrayList<String> text = new ArrayList<String>();
text.addAll(Arrays.asList(getParagraphText())); text.addAll(Arrays.asList(getParagraphText()));
text.addAll(Arrays.asList(getFootnoteText())); text.addAll(Arrays.asList(getFootnoteText()));
text.addAll(Arrays.asList(getEndnoteText())); text.addAll(Arrays.asList(getEndnoteText()));
for(String p : text) { for(String p : text) {
ret.append(p); ret.append(p);
} }
ret.append(getFooterText()); ret.append(getFooterText());
return ret.toString(); return ret.toString();
} }
/** /**

View File

@ -34,6 +34,7 @@ public final class OldSectionTable extends SectionTable
TextPieceTable tpt) TextPieceTable tpt)
{ {
PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12); PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
CharIsBytes charConv = new CharIsBytes(tpt);
int length = sedPlex.length(); int length = sedPlex.length();
@ -49,7 +50,7 @@ public final class OldSectionTable extends SectionTable
// check for the optimization // check for the optimization
if (fileOffset == 0xffffffff) if (fileOffset == 0xffffffff)
{ {
_sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0])); _sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0]));
} }
else else
{ {
@ -58,8 +59,32 @@ public final class OldSectionTable extends SectionTable
byte[] buf = new byte[sepxSize]; byte[] buf = new byte[sepxSize];
fileOffset += LittleEndian.SHORT_SIZE; fileOffset += LittleEndian.SHORT_SIZE;
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length); System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
_sections.add(new SEPX(sed, startAt, endAt, tpt, buf)); _sections.add(new SEPX(sed, startAt, endAt, charConv, buf));
} }
} }
} }
/**
 * A {@link CharIndexTranslator} that treats a byte position as if it
 * already were a character position: both {@code getCharIndex} variants
 * hand the supplied byte position straight back. The remaining lookups
 * are forwarded unchanged to the wrapped {@link TextPieceTable}.
 */
private static class CharIsBytes implements CharIndexTranslator {
    /** Piece table that answers the "is in table" / nearest-index queries. */
    private final TextPieceTable pieceTable;

    /**
     * @param tpt the text piece table to delegate the index lookups to
     */
    private CharIsBytes(TextPieceTable tpt) {
        pieceTable = tpt;
    }

    // ---- Identity mappings: no byte -> char conversion is performed ----

    /**
     * @param bytePos the byte position
     * @return {@code bytePos}, unchanged
     */
    public int getCharIndex(int bytePos) {
        return bytePos;
    }

    /**
     * @param bytePos the byte position
     * @param startCP ignored by this implementation
     * @return {@code bytePos}, unchanged
     */
    public int getCharIndex(int bytePos, int startCP) {
        return bytePos;
    }

    // ---- Straight delegation to the wrapped piece table ----

    /**
     * @param bytePos the byte position
     * @return whatever the wrapped piece table reports for {@code bytePos}
     */
    public boolean isIndexInTable(int bytePos) {
        return pieceTable.isIndexInTable(bytePos);
    }

    /**
     * @param bytePos the byte position
     * @return the wrapped piece table's forward lookup result
     */
    public int lookIndexForward(int bytePos) {
        return pieceTable.lookIndexForward(bytePos);
    }

    /**
     * @param bytePos the byte position
     * @return the wrapped piece table's backward lookup result
     */
    public int lookIndexBackward(int bytePos) {
        return pieceTable.lookIndexBackward(bytePos);
    }
}
} }

View File

@ -216,6 +216,8 @@ public class TextPieceTable implements CharIndexTranslator {
if (bytePos< pieceStart || bytePos > pieceEnd) { if (bytePos< pieceStart || bytePos > pieceEnd) {
toAdd = bytesLength; toAdd = bytesLength;
} else if (bytePos > pieceStart && bytePos < pieceEnd) {
toAdd = (bytePos - pieceStart);
} else { } else {
toAdd = bytesLength - (pieceEnd - bytePos); toAdd = bytesLength - (pieceEnd - bytePos);
} }

View File

@ -37,7 +37,8 @@ public final class SprmIterator
public boolean hasNext() public boolean hasNext()
{ {
return _offset < _grpprl.length; // A Sprm is at least 2 bytes long
return _offset < (_grpprl.length-1);
} }
public SprmOperation next() public SprmOperation next()

View File

@ -20,22 +20,68 @@ package org.apache.poi.hwpf;
import junit.framework.Test; import junit.framework.Test;
import junit.framework.TestSuite; import junit.framework.TestSuite;
import org.apache.poi.hwpf.model.*; import org.apache.poi.hwpf.extractor.TestWordExtractor;
import org.apache.poi.hwpf.extractor.TestWordExtractorBugs;
import org.apache.poi.hwpf.model.TestCHPBinTable;
import org.apache.poi.hwpf.model.TestDocumentProperties;
import org.apache.poi.hwpf.model.TestFileInformationBlock;
import org.apache.poi.hwpf.model.TestFontTable;
import org.apache.poi.hwpf.model.TestListTables;
import org.apache.poi.hwpf.model.TestPAPBinTable;
import org.apache.poi.hwpf.model.TestPlexOfCps;
import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable;
import org.apache.poi.hwpf.model.TestSavedByTable;
import org.apache.poi.hwpf.model.TestSectionTable;
import org.apache.poi.hwpf.model.TestStyleSheet;
import org.apache.poi.hwpf.model.TestTextPieceTable;
import org.apache.poi.hwpf.usermodel.TestBug46610;
import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument;
import org.apache.poi.hwpf.usermodel.TestHeaderStories;
import org.apache.poi.hwpf.usermodel.TestPictures;
import org.apache.poi.hwpf.usermodel.TestProblems;
import org.apache.poi.hwpf.usermodel.TestRange;
import org.apache.poi.hwpf.usermodel.TestRangeDelete;
import org.apache.poi.hwpf.usermodel.TestRangeInsertion;
import org.apache.poi.hwpf.usermodel.TestRangeProperties;
import org.apache.poi.hwpf.usermodel.TestRangeReplacement;
import org.apache.poi.hwpf.usermodel.TestShapes;
public final class AllHWPFTests { public final class AllHWPFTests {
public static Test suite() { public static Test suite() {
TestSuite suite = new TestSuite(AllHWPFTests.class.getName()); TestSuite suite = new TestSuite(AllHWPFTests.class.getName());
suite.addTestSuite(TestHWPFPictures.class);
suite.addTestSuite(TestHWPFRangeParts.class);
suite.addTestSuite(TestWordExtractor.class);
suite.addTestSuite(TestWordExtractorBugs.class);
suite.addTestSuite(TestCHPBinTable.class); suite.addTestSuite(TestCHPBinTable.class);
suite.addTestSuite(TestDocumentProperties.class); suite.addTestSuite(TestDocumentProperties.class);
suite.addTestSuite(TestFileInformationBlock.class); suite.addTestSuite(TestFileInformationBlock.class);
suite.addTestSuite(TestFontTable.class); suite.addTestSuite(TestFontTable.class);
suite.addTestSuite(TestListTables.class);
suite.addTestSuite(TestPAPBinTable.class); suite.addTestSuite(TestPAPBinTable.class);
suite.addTestSuite(TestPlexOfCps.class); suite.addTestSuite(TestPlexOfCps.class);
suite.addTestSuite(TestRevisionMarkAuthorTable.class);
suite.addTestSuite(TestSavedByTable.class);
suite.addTestSuite(TestSectionTable.class); suite.addTestSuite(TestSectionTable.class);
suite.addTestSuite(TestStyleSheet.class); suite.addTestSuite(TestStyleSheet.class);
suite.addTestSuite(TestTextPieceTable.class); suite.addTestSuite(TestTextPieceTable.class);
suite.addTestSuite(TestListTables.class);
suite.addTestSuite(TestBug46610.class);
suite.addTestSuite(TestHeaderStories.class);
suite.addTestSuite(TestHWPFOldDocument.class);
suite.addTestSuite(TestPictures.class);
suite.addTestSuite(TestProblems.class);
suite.addTestSuite(TestRange.class);
suite.addTestSuite(TestRangeDelete.class);
suite.addTestSuite(TestRangeInsertion.class);
suite.addTestSuite(TestRangeProperties.class);
suite.addTestSuite(TestRangeReplacement.class);
suite.addTestSuite(TestShapes.class);
return suite; return suite;
} }
} }

View File

@ -17,6 +17,7 @@
package org.apache.poi.hwpf; package org.apache.poi.hwpf;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import java.io.*; import java.io.*;
@ -30,6 +31,14 @@ public class HWPFTestDataSamples {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
/**
 * Opens a sample document as a {@link HWPFOldDocument}.
 * The resource is fetched through
 * {@code POIDataSamples.getDocumentInstance()} and wrapped in a
 * {@link POIFSFileSystem} before being handed to the document.
 *
 * @param sampleFileName name of the sample resource to open
 * @return the document built from that resource
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *         opening or reading the resource
 */
public static HWPFOldDocument openOldSampleFile(String sampleFileName) {
    try {
        InputStream sampleStream =
            POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName);
        POIFSFileSystem fileSystem = new POIFSFileSystem(sampleStream);
        return new HWPFOldDocument(fileSystem);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
/** /**
* Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back * Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back
* from a <tt>ByteArrayInputStream</tt>.<p/> * from a <tt>ByteArrayInputStream</tt>.<p/>

View File

@ -52,7 +52,7 @@ public final class TestWordExtractor extends TestCase {
// Well behaved document // Well behaved document
private WordExtractor extractor; private WordExtractor extractor;
// Corrupted document - can't do paragraph based stuff // Slightly iffy document
private WordExtractor extractor2; private WordExtractor extractor2;
// A Word doc embedded in an Excel file // A Word doc embedded in an Excel file
private String filename3; private String filename3;
@ -93,8 +93,11 @@ public final class TestWordExtractor extends TestCase {
assertEquals(p_text1[i], text[i]); assertEquals(p_text1[i], text[i]);
} }
// On second one, should fall back // Lots of paragraphs with only a few lines in them
assertEquals(1, extractor2.getParagraphText().length); assertEquals(24, extractor2.getParagraphText().length);
assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
} }
/** /**
@ -103,8 +106,11 @@ public final class TestWordExtractor extends TestCase {
public void testGetText() { public void testGetText() {
assertEquals(p_text1_block, extractor.getText()); assertEquals(p_text1_block, extractor.getText());
// On second one, should fall back to text piece // For the 2nd, should give similar answers for
assertEquals(extractor2.getTextFromPieces(), extractor2.getText()); // the two methods, differing only in line endings
assertEquals(
extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
extractor2.getText().replaceAll("[\\r\\n]", ""));
} }
/** /**

View File

@ -0,0 +1,122 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.usermodel;
import org.apache.poi.OldFileFormatException;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.HWPFTestCase;
import org.apache.poi.hwpf.HWPFTestDataSamples;
/**
* Tests for Word 6 and Word 95 support
*/
public final class TestHWPFOldDocument extends HWPFTestCase {

    /**
     * Asserts that opening the sample through
     * {@code HWPFTestDataSamples.openSampleFile} is rejected with an
     * {@link OldFileFormatException}.
     *
     * @param sampleFileName name of the sample file to try
     */
    private void assertNotOpenableAsHWPFDocument(String sampleFileName) throws Exception {
        try {
            HWPFTestDataSamples.openSampleFile(sampleFileName);
            fail("Shouldn't be openable");
        } catch (OldFileFormatException expected) {
            // Rejection is exactly what this check is looking for
        }
    }

    /**
     * @return the text of paragraph {@code index} of the document's range
     */
    private String paragraphText(HWPFOldDocument doc, int index) {
        return doc.getRange().getParagraph(index).text();
    }

    /**
     * @return the number of character runs in paragraph {@code index}
     *         of the document's range
     */
    private int runCount(HWPFOldDocument doc, int index) {
        return doc.getRange().getParagraph(index).numCharacterRuns();
    }

    /**
     * Test a simple Word 6 document: one section, one paragraph,
     * one character run.
     */
    public void testWord6() throws Exception {
        // Can't open as HWPFDocument
        assertNotOpenableAsHWPFDocument("Word6.doc");

        // Open with the old-format reader instead
        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");

        // Check the overall structure, then the text
        assertEquals(1, doc.getRange().numSections());
        assertEquals(1, doc.getRange().numParagraphs());
        assertEquals(1, doc.getRange().numCharacterRuns());

        assertEquals(
                "The quick brown fox jumps over the lazy dog\r",
                paragraphText(doc, 0)
        );
    }

    /**
     * Test a simple Word 95 document: one section holding seven
     * paragraphs, some of which contain several character runs.
     */
    public void testWord95() throws Exception {
        // Can't open as HWPFDocument
        assertNotOpenableAsHWPFDocument("Word95.doc");

        // Open with the old-format reader instead
        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc");

        // Check the overall structure
        assertEquals(1, doc.getRange().numSections());
        assertEquals(7, doc.getRange().numParagraphs());

        // Expected paragraph text, indexed by paragraph number
        String[] expectedText = {
                "The quick brown fox jumps over the lazy dog\r",
                "\r",
                "Paragraph 2\r",
                "\r",
                "Paragraph 3. Has some RED text and some " +
                "BLUE BOLD text in it.\r",
                "\r",
                "Last (4th) paragraph.\r",
        };
        for (int i = 0; i < expectedText.length; i++) {
            assertEquals(expectedText[i], paragraphText(doc, i));
        }

        // Expected character run counts, indexed by paragraph number
        int[] expectedRuns = {
                1,
                1,
                1,
                1,
                5, // Normal, red, normal, blue+bold, normal
                1,
                3, // Normal, superscript for 4th, normal
        };
        for (int i = 0; i < expectedRuns.length; i++) {
            assertEquals(expectedRuns[i], runCount(doc, i));
        }
    }

    /**
     * Test a word document that has sections,
     * as well as the usual paragraph stuff.
     */
    public void testWord6Sections() throws Exception {
        HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");

        assertEquals(3, doc.getRange().numSections());
        assertEquals(6, doc.getRange().numParagraphs());

        // Expected paragraph text, indexed by paragraph number
        String[] expectedText = {
                "This is a test.\r",
                "\r",
                "\u000c", // Section line?
                "This is a new section.\r",
                "\u000c", // Section line?
                "\r",
        };
        for (int i = 0; i < expectedText.length; i++) {
            assertEquals(expectedText[i], paragraphText(doc, i));
        }
    }
}

Binary file not shown.