mirror of https://github.com/apache/poi.git
Fix support for sections in old word 6 / word 95 files
Improve unit testing for HWPFOldDocument. The Sprm fix also improves some HWPFDocument files too! git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@998131 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
44b21ca848
commit
8cd8659010
|
@ -34,6 +34,7 @@
|
|||
|
||||
<changes>
|
||||
<release version="3.7-beta3" date="2010-??-??">
|
||||
<action dev="poi-developers" type="fix">49933 - Support sections in Word 6 and Word 95 files (HWPFOldDocument)</action>
|
||||
<action dev="poi-developers" type="fix">49941 - Correctly handle space preservation of XSSFRichTextRuns when applying fonts to parts of the string</action>
|
||||
<action dev="poi-developers" type="fix">Correct XWPFRun detection of bold/italic in a paragraph with multiple runs of different styles</action>
|
||||
<action dev="poi-developers" type="add">Link XWPFPicture to XWPFRun, so that embedded pictures can be accessed from where they live in the text stream</action>
|
||||
|
|
|
@ -245,22 +245,22 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
|||
* but slightly slower than getTextFromPieces().
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer ret = new StringBuffer();
|
||||
StringBuffer ret = new StringBuffer();
|
||||
|
||||
ret.append(getHeaderText());
|
||||
ret.append(getHeaderText());
|
||||
|
||||
ArrayList<String> text = new ArrayList<String>();
|
||||
text.addAll(Arrays.asList(getParagraphText()));
|
||||
text.addAll(Arrays.asList(getFootnoteText()));
|
||||
text.addAll(Arrays.asList(getEndnoteText()));
|
||||
ArrayList<String> text = new ArrayList<String>();
|
||||
text.addAll(Arrays.asList(getParagraphText()));
|
||||
text.addAll(Arrays.asList(getFootnoteText()));
|
||||
text.addAll(Arrays.asList(getEndnoteText()));
|
||||
|
||||
for(String p : text) {
|
||||
ret.append(p);
|
||||
}
|
||||
for(String p : text) {
|
||||
ret.append(p);
|
||||
}
|
||||
|
||||
ret.append(getFooterText());
|
||||
ret.append(getFooterText());
|
||||
|
||||
return ret.toString();
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -34,6 +34,7 @@ public final class OldSectionTable extends SectionTable
|
|||
TextPieceTable tpt)
|
||||
{
|
||||
PlexOfCps sedPlex = new PlexOfCps(documentStream, offset, size, 12);
|
||||
CharIsBytes charConv = new CharIsBytes(tpt);
|
||||
|
||||
int length = sedPlex.length();
|
||||
|
||||
|
@ -49,7 +50,7 @@ public final class OldSectionTable extends SectionTable
|
|||
// check for the optimization
|
||||
if (fileOffset == 0xffffffff)
|
||||
{
|
||||
_sections.add(new SEPX(sed, startAt, endAt, tpt, new byte[0]));
|
||||
_sections.add(new SEPX(sed, startAt, endAt, charConv, new byte[0]));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -58,8 +59,32 @@ public final class OldSectionTable extends SectionTable
|
|||
byte[] buf = new byte[sepxSize];
|
||||
fileOffset += LittleEndian.SHORT_SIZE;
|
||||
System.arraycopy(documentStream, fileOffset, buf, 0, buf.length);
|
||||
_sections.add(new SEPX(sed, startAt, endAt, tpt, buf));
|
||||
_sections.add(new SEPX(sed, startAt, endAt, charConv, buf));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * A {@link CharIndexTranslator} that uses a byte position directly as the
 * character index: both getCharIndex overloads return bytePos unchanged.
 * Table membership and the backward/forward lookups are delegated to the
 * wrapped {@link TextPieceTable}.
 * NOTE(review): presumably valid because Word 6 / Word 95 text is stored
 * one byte per character - confirm against the file format.
 */
private static class CharIsBytes implements CharIndexTranslator {
|
||||
// Table that answers the membership and neighbour-lookup queries
private TextPieceTable tpt;
|
||||
// Wraps the given text piece table; no copy is made
private CharIsBytes(TextPieceTable tpt) {
|
||||
this.tpt = tpt;
|
||||
}
|
||||
|
||||
// Identity mapping: startCP is ignored and bytePos is returned as-is
public int getCharIndex(int bytePos, int startCP) {
|
||||
return bytePos;
|
||||
}
|
||||
// Identity mapping: bytePos is returned as-is
public int getCharIndex(int bytePos) {
|
||||
return bytePos;
|
||||
}
|
||||
|
||||
// Delegates to the wrapped TextPieceTable
public boolean isIndexInTable(int bytePos) {
|
||||
return tpt.isIndexInTable(bytePos);
|
||||
}
|
||||
// Delegates to the wrapped TextPieceTable
public int lookIndexBackward(int bytePos) {
|
||||
return tpt.lookIndexBackward(bytePos);
|
||||
}
|
||||
// Delegates to the wrapped TextPieceTable
public int lookIndexForward(int bytePos) {
|
||||
return tpt.lookIndexForward(bytePos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -216,6 +216,8 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||
|
||||
if (bytePos< pieceStart || bytePos > pieceEnd) {
|
||||
toAdd = bytesLength;
|
||||
} else if (bytePos > pieceStart && bytePos < pieceEnd) {
|
||||
toAdd = (bytePos - pieceStart);
|
||||
} else {
|
||||
toAdd = bytesLength - (pieceEnd - bytePos);
|
||||
}
|
||||
|
|
|
@ -37,7 +37,8 @@ public final class SprmIterator
|
|||
|
||||
public boolean hasNext()
|
||||
{
|
||||
return _offset < _grpprl.length;
|
||||
// A Sprm is at least 2 bytes long
|
||||
return _offset < (_grpprl.length-1);
|
||||
}
|
||||
|
||||
public SprmOperation next()
|
||||
|
|
|
@ -20,22 +20,68 @@ package org.apache.poi.hwpf;
|
|||
import junit.framework.Test;
|
||||
import junit.framework.TestSuite;
|
||||
|
||||
import org.apache.poi.hwpf.model.*;
|
||||
import org.apache.poi.hwpf.extractor.TestWordExtractor;
|
||||
import org.apache.poi.hwpf.extractor.TestWordExtractorBugs;
|
||||
import org.apache.poi.hwpf.model.TestCHPBinTable;
|
||||
import org.apache.poi.hwpf.model.TestDocumentProperties;
|
||||
import org.apache.poi.hwpf.model.TestFileInformationBlock;
|
||||
import org.apache.poi.hwpf.model.TestFontTable;
|
||||
import org.apache.poi.hwpf.model.TestListTables;
|
||||
import org.apache.poi.hwpf.model.TestPAPBinTable;
|
||||
import org.apache.poi.hwpf.model.TestPlexOfCps;
|
||||
import org.apache.poi.hwpf.model.TestRevisionMarkAuthorTable;
|
||||
import org.apache.poi.hwpf.model.TestSavedByTable;
|
||||
import org.apache.poi.hwpf.model.TestSectionTable;
|
||||
import org.apache.poi.hwpf.model.TestStyleSheet;
|
||||
import org.apache.poi.hwpf.model.TestTextPieceTable;
|
||||
import org.apache.poi.hwpf.usermodel.TestBug46610;
|
||||
import org.apache.poi.hwpf.usermodel.TestHWPFOldDocument;
|
||||
import org.apache.poi.hwpf.usermodel.TestHeaderStories;
|
||||
import org.apache.poi.hwpf.usermodel.TestPictures;
|
||||
import org.apache.poi.hwpf.usermodel.TestProblems;
|
||||
import org.apache.poi.hwpf.usermodel.TestRange;
|
||||
import org.apache.poi.hwpf.usermodel.TestRangeDelete;
|
||||
import org.apache.poi.hwpf.usermodel.TestRangeInsertion;
|
||||
import org.apache.poi.hwpf.usermodel.TestRangeProperties;
|
||||
import org.apache.poi.hwpf.usermodel.TestRangeReplacement;
|
||||
import org.apache.poi.hwpf.usermodel.TestShapes;
|
||||
|
||||
/**
 * Collects the HWPF test cases into a single JUnit {@link TestSuite}.
 */
public final class AllHWPFTests {
|
||||
|
||||
/**
 * Builds the aggregate suite, named after this class.
 *
 * @return a suite containing every test class registered below
 */
public static Test suite() {
|
||||
TestSuite suite = new TestSuite(AllHWPFTests.class.getName());
|
||||
|
||||
// Test classes with no explicit import in this listing
// NOTE(review): presumably from this same package - confirm
suite.addTestSuite(TestHWPFPictures.class);
|
||||
suite.addTestSuite(TestHWPFRangeParts.class);
|
||||
|
||||
// Tests imported from org.apache.poi.hwpf.extractor
suite.addTestSuite(TestWordExtractor.class);
|
||||
suite.addTestSuite(TestWordExtractorBugs.class);
|
||||
|
||||
// Tests imported from org.apache.poi.hwpf.model
suite.addTestSuite(TestCHPBinTable.class);
|
||||
suite.addTestSuite(TestDocumentProperties.class);
|
||||
suite.addTestSuite(TestFileInformationBlock.class);
|
||||
suite.addTestSuite(TestFontTable.class);
|
||||
suite.addTestSuite(TestListTables.class);
|
||||
suite.addTestSuite(TestPAPBinTable.class);
|
||||
suite.addTestSuite(TestPlexOfCps.class);
|
||||
suite.addTestSuite(TestRevisionMarkAuthorTable.class);
|
||||
suite.addTestSuite(TestSavedByTable.class);
|
||||
suite.addTestSuite(TestSectionTable.class);
|
||||
suite.addTestSuite(TestStyleSheet.class);
|
||||
suite.addTestSuite(TestTextPieceTable.class);
|
||||
// NOTE(review): TestListTables already appears earlier in this list; confirm
// whether this second entry is a removed diff line or a genuine duplicate
suite.addTestSuite(TestListTables.class);
|
||||
|
||||
// Tests imported from org.apache.poi.hwpf.usermodel
suite.addTestSuite(TestBug46610.class);
|
||||
suite.addTestSuite(TestHeaderStories.class);
|
||||
suite.addTestSuite(TestHWPFOldDocument.class);
|
||||
suite.addTestSuite(TestPictures.class);
|
||||
suite.addTestSuite(TestProblems.class);
|
||||
suite.addTestSuite(TestRange.class);
|
||||
suite.addTestSuite(TestRangeDelete.class);
|
||||
suite.addTestSuite(TestRangeInsertion.class);
|
||||
suite.addTestSuite(TestRangeProperties.class);
|
||||
suite.addTestSuite(TestRangeReplacement.class);
|
||||
suite.addTestSuite(TestShapes.class);
|
||||
|
||||
return suite;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
package org.apache.poi.hwpf;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
|
@ -30,6 +31,14 @@ public class HWPFTestDataSamples {
|
|||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
/**
 * Opens the named sample as an {@link HWPFOldDocument}.
 * The resource stream is obtained from POIDataSamples.getDocumentInstance()
 * and wrapped in a POIFSFileSystem before being handed to HWPFOldDocument.
 *
 * @param sampleFileName name of the sample resource to open
 * @return the document built from the sample
 * @throws RuntimeException wrapping any IOException raised while opening
 */
public static HWPFOldDocument openOldSampleFile(String sampleFileName) {
|
||||
try {
|
||||
// NOTE(review): the stream is not closed here - confirm that
// POIFSFileSystem takes ownership of it
InputStream is = POIDataSamples.getDocumentInstance().openResourceAsStream(sampleFileName);
|
||||
return new HWPFOldDocument(new POIFSFileSystem(is));
|
||||
} catch (IOException e) {
|
||||
// Rethrown unchecked, with the original exception kept as the cause
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Writes a spreadsheet to a <tt>ByteArrayOutputStream</tt> and reads it back
|
||||
* from a <tt>ByteArrayInputStream</tt>.<p/>
|
||||
|
|
|
@ -52,7 +52,7 @@ public final class TestWordExtractor extends TestCase {
|
|||
|
||||
// Well behaved document
|
||||
private WordExtractor extractor;
|
||||
// Corrupted document - can't do paragraph based stuff
|
||||
// Slightly iffy document
|
||||
private WordExtractor extractor2;
|
||||
// A Word doc embedded in an Excel file
|
||||
private String filename3;
|
||||
|
@ -93,8 +93,11 @@ public final class TestWordExtractor extends TestCase {
|
|||
assertEquals(p_text1[i], text[i]);
|
||||
}
|
||||
|
||||
// On second one, should fall back
|
||||
assertEquals(1, extractor2.getParagraphText().length);
|
||||
// Lots of paragraphs with only a few lines in them
|
||||
assertEquals(24, extractor2.getParagraphText().length);
|
||||
assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
|
||||
assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
|
||||
assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -103,8 +106,11 @@ public final class TestWordExtractor extends TestCase {
|
|||
public void testGetText() {
|
||||
assertEquals(p_text1_block, extractor.getText());
|
||||
|
||||
// On second one, should fall back to text piece
|
||||
assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
|
||||
// For the 2nd, should give similar answers for
|
||||
// the two methods, differing only in line endings
|
||||
assertEquals(
|
||||
extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
|
||||
extractor2.getText().replaceAll("[\\r\\n]", ""));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,122 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import org.apache.poi.OldFileFormatException;
|
||||
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||
import org.apache.poi.hwpf.HWPFTestCase;
|
||||
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||
|
||||
/**
|
||||
* Tests for Word 6 and Word 95 support
|
||||
*/
|
||||
// Each test opens a sample via HWPFTestDataSamples.openOldSampleFile and
// checks section, paragraph and character-run counts plus paragraph text.
public final class TestHWPFOldDocument extends HWPFTestCase {
|
||||
/**
|
||||
* Test a simple Word 6 document
|
||||
*/
|
||||
public void testWord6() throws Exception {
|
||||
// Can't open as HWPFDocument: openSampleFile must throw OldFileFormatException
|
||||
try {
|
||||
HWPFTestDataSamples.openSampleFile("Word6.doc");
|
||||
// Only reached if no exception was thrown, which is a test failure
fail("Shouldn't be openable");
|
||||
} catch(OldFileFormatException e) {} // expected - nothing to do
|
||||
|
||||
// Open with the old-format loader instead
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");
|
||||
|
||||
// Check: one section, one paragraph, one character run
|
||||
assertEquals(1, doc.getRange().numSections());
|
||||
assertEquals(1, doc.getRange().numParagraphs());
|
||||
assertEquals(1, doc.getRange().numCharacterRuns());
|
||||
|
||||
// Paragraph text includes the trailing carriage return
assertEquals(
|
||||
"The quick brown fox jumps over the lazy dog\r",
|
||||
doc.getRange().getParagraph(0).text()
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test a simple Word 95 document
|
||||
*/
|
||||
public void testWord95() throws Exception {
|
||||
// Can't open as HWPFDocument: openSampleFile must throw OldFileFormatException
|
||||
try {
|
||||
HWPFTestDataSamples.openSampleFile("Word95.doc");
|
||||
// Only reached if no exception was thrown, which is a test failure
fail("Shouldn't be openable");
|
||||
} catch(OldFileFormatException e) {} // expected - nothing to do
|
||||
|
||||
// Open with the old-format loader instead
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word95.doc");
|
||||
|
||||
// Check: one section holding seven paragraphs
|
||||
assertEquals(1, doc.getRange().numSections());
|
||||
assertEquals(7, doc.getRange().numParagraphs());
|
||||
|
||||
// Text paragraphs (0, 2, 4, 6) alternate with empty "\r" paragraphs (1, 3, 5)
assertEquals(
|
||||
"The quick brown fox jumps over the lazy dog\r",
|
||||
doc.getRange().getParagraph(0).text()
|
||||
);
|
||||
assertEquals("\r", doc.getRange().getParagraph(1).text());
|
||||
assertEquals(
|
||||
"Paragraph 2\r",
|
||||
doc.getRange().getParagraph(2).text()
|
||||
);
|
||||
assertEquals("\r", doc.getRange().getParagraph(3).text());
|
||||
assertEquals(
|
||||
"Paragraph 3. Has some RED text and some " +
|
||||
"BLUE BOLD text in it.\r",
|
||||
doc.getRange().getParagraph(4).text()
|
||||
);
|
||||
assertEquals("\r", doc.getRange().getParagraph(5).text());
|
||||
assertEquals(
|
||||
"Last (4th) paragraph.\r",
|
||||
doc.getRange().getParagraph(6).text()
|
||||
);
|
||||
|
||||
// Character runs per paragraph; paragraphs with formatting changes have more than one
assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns());
|
||||
assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns());
|
||||
assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns());
|
||||
assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns());
|
||||
// Normal, red, normal, blue+bold, normal
|
||||
assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns());
|
||||
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
|
||||
// Normal, superscript for 4th, normal
|
||||
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test a word document that has sections,
|
||||
* as well as the usual paragraph stuff.
|
||||
*/
|
||||
public void testWord6Sections() throws Exception {
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");
|
||||
|
||||
// Three sections spread over six paragraphs
assertEquals(3, doc.getRange().numSections());
|
||||
assertEquals(6, doc.getRange().numParagraphs());
|
||||
|
||||
assertEquals(
|
||||
"This is a test.\r",
|
||||
doc.getRange().getParagraph(0).text()
|
||||
);
|
||||
assertEquals("\r", doc.getRange().getParagraph(1).text());
|
||||
// Paragraphs 2 and 4 consist solely of U+000C (form feed)
// NOTE(review): presumably the section-break marker - confirm
assertEquals("\u000c", doc.getRange().getParagraph(2).text()); // Section line?
|
||||
assertEquals("This is a new section.\r", doc.getRange().getParagraph(3).text());
|
||||
assertEquals("\u000c", doc.getRange().getParagraph(4).text()); // Section line?
|
||||
assertEquals("\r", doc.getRange().getParagraph(5).text());
|
||||
}
|
||||
}
|
Binary file not shown.
Loading…
Reference in New Issue