Fix bug #49189 - Detect w:tab and w:cr entries in XWPF paragraphs, even when the XSD is silly and maps them to CTEmpty

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@948199 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-05-25 20:31:42 +00:00
parent 2836dd45d6
commit 216ecd1407
4 changed files with 32 additions and 0 deletions

View File

@ -34,6 +34,7 @@
<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="fix">49189 - Detect w:tab and w:cr entries in XWPF paragraphs, even when the XSD is silly and maps them to CTEmpty</action>
<action dev="POI-DEVELOPERS" type="fix">49273 - Correct handling for Font Character Sets with indicies greater than 127</action>
<action dev="POI-DEVELOPERS" type="add">49334 - Track the ValueRangeRecords of charts in HSSFChart, to allow the basic axis operations</action>
<action dev="POI-DEVELOPERS" type="add">49242 - Track the LinkDataRecords of charts in HSSFChart</action>

View File

@ -24,6 +24,7 @@ import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.apache.poi.util.Internal;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFtnEdnRef;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTInd;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTJc;
@ -108,6 +109,18 @@ public class XWPFParagraph {
if (o instanceof CTPTab) {
text.append("\t");
}
if (o instanceof CTEmpty) {
// Some inline text elements get returned not as
// themselves, but as CTEmpty, owing to some odd
// definitions around line 5642 of the XSDs
String tagName = o.getDomNode().getNodeName();
if ("w:tab".equals(tagName)) {
text.append("\t");
}
if ("w:cr".equals(tagName)) {
text.append("\n");
}
}
//got a reference to a footnote
if (o instanceof CTFtnEdnRef) {
CTFtnEdnRef ftn = (CTFtnEdnRef) o;

View File

@ -219,4 +219,22 @@ public class TestXWPFWordExtractor extends TestCase {
assertTrue(extractor.getText().contains("2008"));
assertTrue(extractor.getText().contains("(120 "));
}
/**
* Test that we handle things like tabs and
* carriage returns properly in the text that
* we're extracting (bug #49189)
*/
public void testDocTabs() {
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("WithTabs.docx");
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
// Check bits
assertTrue(extractor.getText().contains("a"));
assertTrue(extractor.getText().contains("\t"));
assertTrue(extractor.getText().contains("b"));
// Now check the first paragraph in total
assertTrue(extractor.getText().contains("a\tb\n"));
}
}

Binary file not shown.