mirror of https://github.com/apache/poi.git
Fix support for sections in old Word 6 / Word 95 files
Improve unit testing for HWPFOldDocument. The Sprm fix also improves handling of some HWPFDocument files! git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@998131 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
44b21ca848
commit
8cd8659010
|
@ -34,6 +34,7 @@
|
||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.7-beta3" date="2010-??-??">
|
<release version="3.7-beta3" date="2010-??-??">
|
||||||
|
<action dev="poi-developers" type="fix">49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument)</action>
|
||||||
<action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action>
|
<action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action>
|
||||||
<action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action>
|
<action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action>
|
||||||
<action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be accessed from where they live in the text stream</action>
|
<action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be accessed from where they live in the text stream</action>
|
||||||
|
|
|
@ -245,22 +245,22 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
||||||
* but slightly slower than getTextFromPieces().
|
* but slightly slower than getTextFromPieces().
|
||||||
*/
|
*/
|
||||||
public String getText() {
|
public String getText() {
|
||||||
StringBuffer ret = new StringBuffer();
|
StringBuffer ret = new StringBuffer();
|
||||||
|
|
||||||
ret.append(getHeaderText());
|
ret.append(getHeaderText());
|
||||||
|
|
||||||
ArrayList<String> text = new ArrayList<String>();
|
ArrayList<String> text = new ArrayList<String>();
|
||||||
text.addAll(Arrays.asList(getParagraphText()));
|
text.addAll(Arrays.asList(getParagraphText()));
|
||||||
text.addAll(Arrays.asList(getFootnoteText()));
|
text.addAll(Arrays.asList(getFootnoteText()));
|
||||||
text.addAll(Arrays.asList(getEndnoteText()));
|
text.addAll(Arrays.asList(getEndnoteText()));
|
||||||
|
|
||||||
for(String p : text) {
|
for(String p : text) {
|
||||||
ret.append(p);
|
ret.append(p);
|
||||||
}
|
}
|
||||||
|
|
||||||
ret.append(getFooterText());
|
ret.append(getFooterText());
|
||||||
|
|
||||||
return ret.toString();
|
return ret.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -34,6 +34,7 @@ public final class OldSectionTable extends SectionTable
|
||||||
TextPieceTable tpt)
|
TextPieceTable tpt)
|
||||||
{
|
{
|
||||||
PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
|
PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
|
||||||
|
CharIsBytes charConv = new CharIsBytes(tpt);
|
||||||
|
|
||||||
int length = sedPlex.length();
|
int length = sedPlex.length();
|
||||||
|
|
||||||
|
@ -49,7 +50,7 @@ public final class OldSectionTable extends SectionTable
|
||||||
// check for the optimization
|
// check for the optimization
|
||||||
if (fileOffset == 0xffffffff)
|
if (fileOffset == 0xffffffff)
|
||||||
{
|
{
|
||||||
_sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
|
_sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0]));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -58,8 +59,32 @@ public final class OldSectionTable extends SectionTable
|
||||||
byte[] buf = new byte[sepxSize];
|
byte[] buf = new byte[sepxSize];
|
||||||
fileOffset += LittleEndian.SHORT_SIZE;
|
fileOffset += LittleEndian.SHORT_SIZE;
|
||||||
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
|
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
|
||||||
_sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
|
_sections.add(new SEPX(sed, startAt, endAt, charConv, buf));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class CharIsBytes implements CharIndexTranslator {
|
||||||
|
private TextPieceTable tpt;
|
||||||
|
private CharIsBytes(TextPieceTable tpt) {
|
||||||
|
this.tpt = tpt;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getCharIndex(int bytePos, int startCP) {
|
||||||
|
return bytePos;
|
||||||
|
}
|
||||||
|
public int getCharIndex(int bytePos) {
|
||||||
|
return bytePos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isIndexInTable(int bytePos) {
|
||||||
|
return tpt.isIndexInTable(bytePos);
|
||||||
|
}
|
||||||
|
public int lookIndexBackward(int bytePos) {
|
||||||
|
return tpt.lookIndexBackward(bytePos);
|
||||||
|
}
|
||||||
|
public int lookIndexForward(int bytePos) {
|
||||||
|
return tpt.lookIndexForward(bytePos);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -216,6 +216,8 @@ public class TextPieceTable implements CharIndexTranslator {
|
||||||
|
|
||||||
if (bytePos< pieceStart || bytePos > pieceEnd) {
|
if (bytePos< pieceStart || bytePos > pieceEnd) {
|
||||||
toAdd = bytesLength;
|
toAdd = bytesLength;
|
||||||
|
} else if (bytePos > pieceStart && bytePos < pieceEnd) {
|
||||||
|
toAdd = (bytePos - pieceStart);
|
||||||
} else {
|
} else {
|
||||||
toAdd = bytesLength - (pieceEnd - bytePos);
|
toAdd = bytesLength - (pieceEnd - bytePos);
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,7 +37,8 @@ public final class SprmIterator
|
||||||
|
|
||||||
public boolean hasNext()
|
public boolean hasNext()
|
||||||
{
|
{
|
||||||
return _offset < _grpprl.length;
|
// A Sprm is at least 2 bytes long
|
||||||
|
return _offset < (_grpprl.length-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SprmOperation next()
|
public SprmOperation next()
|
||||||
|
|
|
@ -20,22 +20,68 @@ package org.apache.poi.hwpf;
|
||||||
import junit.framework.Test;
|
import junit.framework.Test;
|
||||||
import junit.framework.TestSuite;
|
import junit.framework.TestSuite;
|
||||||
|
|
||||||
import org.apache.poi.hwpf.model.*;
|
import org.apache.poi.hwpf.extractor.TestWordExtractor;
|
||||||
|
import org.apache.poi.hwpf.extractor.TestWordExtractorBugs;
|
||||||
|
import org.apache.poi.hwpf.model.TestCHPBinTable;
|
||||||
|
import org.apache.poi.hwpf.model.TestDocumentProperties;
|
||||||
|
import org.apache.poi.hwpf.model.TestFileInformationBlock;
|
||||||
|
import org.apache.poi.hwpf.model.TestFontTable;
|
||||||
|
import org.apache.poi.hwpf.model.TestListTables;
|
||||||
|
import org.apache.poi.hwpf.model.TestPAPBinTable;
|
||||||
|
import org.apache.poi.hwpf.model.TestPlexOfCps;
|
||||||
|
import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable;
|
||||||
|
import org.apache.poi.hwpf.model.TestSavedByTable;
|
||||||
|
import org.apache.poi.hwpf.model.TestSectionTable;
|
||||||
|
import org.apache.poi.hwpf.model.TestStyleSheet;
|
||||||
|
import org.apache.poi.hwpf.model.TestTextPieceTable;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestBug46610;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestHeaderStories;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestPictures;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestProblems;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestRange;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestRangeDelete;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestRangeInsertion;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestRangeProperties;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestRangeReplacement;
|
||||||
|
import org.apache.poi.hwpf.usermodel.TestShapes;
|
||||||
|
|
||||||
public final class AllHWPFTests {
|
public final class AllHWPFTests {
|
||||||
|
|
||||||
public static Test suite() {
|
public static Test suite() {
|
||||||
TestSuite suite = new TestSuite(AllHWPFTests.class.getName());
|
TestSuite suite = new TestSuite(AllHWPFTests.class.getName());
|
||||||
|
|
||||||
|
suite.addTestSuite(TestHWPFPictures.class);
|
||||||
|
suite.addTestSuite(TestHWPFRangeParts.class);
|
||||||
|
|
||||||
|
suite.addTestSuite(TestWordExtractor.class);
|
||||||
|
suite.addTestSuite(TestWordExtractorBugs.class);
|
||||||
|
|
||||||
suite.addTestSuite(TestCHPBinTable.class);
|
suite.addTestSuite(TestCHPBinTable.class);
|
||||||
suite.addTestSuite(TestDocumentProperties.class);
|
suite.addTestSuite(TestDocumentProperties.class);
|
||||||
suite.addTestSuite(TestFileInformationBlock.class);
|
suite.addTestSuite(TestFileInformationBlock.class);
|
||||||
suite.addTestSuite(TestFontTable.class);
|
suite.addTestSuite(TestFontTable.class);
|
||||||
|
suite.addTestSuite(TestListTables.class);
|
||||||
suite.addTestSuite(TestPAPBinTable.class);
|
suite.addTestSuite(TestPAPBinTable.class);
|
||||||
suite.addTestSuite(TestPlexOfCps.class);
|
suite.addTestSuite(TestPlexOfCps.class);
|
||||||
|
suite.addTestSuite(TestRevisionMarkAuthorTable.class);
|
||||||
|
suite.addTestSuite(TestSavedByTable.class);
|
||||||
suite.addTestSuite(TestSectionTable.class);
|
suite.addTestSuite(TestSectionTable.class);
|
||||||
suite.addTestSuite(TestStyleSheet.class);
|
suite.addTestSuite(TestStyleSheet.class);
|
||||||
suite.addTestSuite(TestTextPieceTable.class);
|
suite.addTestSuite(TestTextPieceTable.class);
|
||||||
suite.addTestSuite(TestListTables.class);
|
|
||||||
|
suite.addTestSuite(TestBug46610.class);
|
||||||
|
suite.addTestSuite(TestHeaderStories.class);
|
||||||
|
suite.addTestSuite(TestHWPFOldDocument.class);
|
||||||
|
suite.addTestSuite(TestPictures.class);
|
||||||
|
suite.addTestSuite(TestProblems.class);
|
||||||
|
suite.addTestSuite(TestRange.class);
|
||||||
|
suite.addTestSuite(TestRangeDelete.class);
|
||||||
|
suite.addTestSuite(TestRangeInsertion.class);
|
||||||
|
suite.addTestSuite(TestRangeProperties.class);
|
||||||
|
suite.addTestSuite(TestRangeReplacement.class);
|
||||||
|
suite.addTestSuite(TestShapes.class);
|
||||||
|
|
||||||
return suite;
|
return suite;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package org.apache.poi.hwpf;
|
package org.apache.poi.hwpf;
|
||||||
|
|
||||||
import org.apache.poi.POIDataSamples;
|
import org.apache.poi.POIDataSamples;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
||||||
|
@ -30,6 +31,14 @@ public class HWPFTestDataSamples {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
public static HWPFOldDocument openOldSampleFile(String sampleFileName) {
|
||||||
|
try {
|
||||||
|
InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName);
|
||||||
|
return new HWPFOldDocument(new POIFSFileSystem(is));
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back
|
* Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back
|
||||||
* from a <tt>ByteArrayInputStream</tt>.<p/>
|
* from a <tt>ByteArrayInputStream</tt>.<p/>
|
||||||
|
|
|
@ -52,7 +52,7 @@ public final class TestWordExtractor extends TestCase {
|
||||||
|
|
||||||
// Well behaved document
|
// Well behaved document
|
||||||
private WordExtractor extractor;
|
private WordExtractor extractor;
|
||||||
// Corrupted document - can't do paragraph based stuff
|
// Slightly iffy document
|
||||||
private WordExtractor extractor2;
|
private WordExtractor extractor2;
|
||||||
// A word doc embedded in an excel file
|
// A word doc embedded in an excel file
|
||||||
private String filename3;
|
private String filename3;
|
||||||
|
@ -93,8 +93,11 @@ public final class TestWordExtractor extends TestCase {
|
||||||
assertEquals(p_text1[i], text[i]);
|
assertEquals(p_text1[i], text[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// On second one, should fall back
|
// Lots of paragraphs with only a few lines in them
|
||||||
assertEquals(1, extractor2.getParagraphText().length);
|
assertEquals(24, extractor2.getParagraphText().length);
|
||||||
|
assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
|
||||||
|
assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
|
||||||
|
assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -103,8 +106,11 @@ public final class TestWordExtractor extends TestCase {
|
||||||
public void testGetText() {
|
public void testGetText() {
|
||||||
assertEquals(p_text1_block, extractor.getText());
|
assertEquals(p_text1_block, extractor.getText());
|
||||||
|
|
||||||
// On second one, should fall back to text piece
|
// For the 2nd, should give similar answers for
|
||||||
assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
|
// the two methods, differing only in line endings
|
||||||
|
assertEquals(
|
||||||
|
extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
|
||||||
|
extractor2.getText().replaceAll("[\\r\\n]", ""));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.usermodel;
|
||||||
|
|
||||||
|
import org.apache.poi.OldFileFormatException;
|
||||||
|
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||||
|
import org.apache.poi.hwpf.HWPFTestCase;
|
||||||
|
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for Word 6 and Word 95 support
|
||||||
|
*/
|
||||||
|
public final class TestHWPFOldDocument extends HWPFTestCase {
|
||||||
|
/**
|
||||||
|
* Test a simple Word 6 document
|
||||||
|
*/
|
||||||
|
public void testWord6() throws Exception {
|
||||||
|
// Can't open as HWPFDocument
|
||||||
|
try {
|
||||||
|
HWPFTestDataSamples.openSampleFile("Word6.doc");
|
||||||
|
fail("Shouldn't be openable");
|
||||||
|
} catch(OldFileFormatException e) {}
|
||||||
|
|
||||||
|
// Open
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");
|
||||||
|
|
||||||
|
// Check
|
||||||
|
assertEquals(1, doc.getRange().numSections());
|
||||||
|
assertEquals(1, doc.getRange().numParagraphs());
|
||||||
|
assertEquals(1, doc.getRange().numCharacterRuns());
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"The quick brown fox jumps over the lazy dog\r",
|
||||||
|
doc.getRange().getParagraph(0).text()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test a simple Word 95 document
|
||||||
|
*/
|
||||||
|
public void testWord95() throws Exception {
|
||||||
|
// Can't open as HWPFDocument
|
||||||
|
try {
|
||||||
|
HWPFTestDataSamples.openSampleFile("Word95.doc");
|
||||||
|
fail("Shouldn't be openable");
|
||||||
|
} catch(OldFileFormatException e) {}
|
||||||
|
|
||||||
|
// Open
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc");
|
||||||
|
|
||||||
|
// Check
|
||||||
|
assertEquals(1, doc.getRange().numSections());
|
||||||
|
assertEquals(7, doc.getRange().numParagraphs());
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"The quick brown fox jumps over the lazy dog\r",
|
||||||
|
doc.getRange().getParagraph(0).text()
|
||||||
|
);
|
||||||
|
assertEquals("\r", doc.getRange().getParagraph(1).text());
|
||||||
|
assertEquals(
|
||||||
|
"Paragraph 2\r",
|
||||||
|
doc.getRange().getParagraph(2).text()
|
||||||
|
);
|
||||||
|
assertEquals("\r", doc.getRange().getParagraph(3).text());
|
||||||
|
assertEquals(
|
||||||
|
"Paragraph 3. Has some RED text and some " +
|
||||||
|
"BLUE BOLD text in it.\r",
|
||||||
|
doc.getRange().getParagraph(4).text()
|
||||||
|
);
|
||||||
|
assertEquals("\r", doc.getRange().getParagraph(5).text());
|
||||||
|
assertEquals(
|
||||||
|
"Last (4th) paragraph.\r",
|
||||||
|
doc.getRange().getParagraph(6).text()
|
||||||
|
);
|
||||||
|
|
||||||
|
assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns());
|
||||||
|
assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns());
|
||||||
|
assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns());
|
||||||
|
assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns());
|
||||||
|
// Normal, red, normal, blue+bold, normal
|
||||||
|
assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns());
|
||||||
|
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
|
||||||
|
// Normal, superscript for 4th, normal
|
||||||
|
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test a word document that has sections,
|
||||||
|
* as well as the usual paragraph stuff.
|
||||||
|
*/
|
||||||
|
public void testWord6Sections() throws Exception {
|
||||||
|
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");
|
||||||
|
|
||||||
|
assertEquals(3, doc.getRange().numSections());
|
||||||
|
assertEquals(6, doc.getRange().numParagraphs());
|
||||||
|
|
||||||
|
assertEquals(
|
||||||
|
"This is a test.\r",
|
||||||
|
doc.getRange().getParagraph(0).text()
|
||||||
|
);
|
||||||
|
assertEquals("\r", doc.getRange().getParagraph(1).text());
|
||||||
|
assertEquals("\u000c", doc.getRange().getParagraph(2).text()); // Section line?
|
||||||
|
assertEquals("This is a new section.\r", doc.getRange().getParagraph(3).text());
|
||||||
|
assertEquals("\u000c", doc.getRange().getParagraph(4).text()); // Section line?
|
||||||
|
assertEquals("\r", doc.getRange().getParagraph(5).text());
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
Loading…
Reference in New Issue