Bug 55966: Include content control text in word extraction also if it is part of a paragraph

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1875802 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dominik Stadler 2020-03-28 09:24:38 +00:00
parent 456dc4d368
commit da2afc19e2
3 changed files with 32 additions and 14 deletions

View File

@ -90,7 +90,7 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
/**
* Should we concatenate phonetic runs in extraction. Default is <code>true</code>
* @param concatenatePhoneticRuns
* @param concatenatePhoneticRuns If phonetic runs should be concatenated
*/
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
@ -138,9 +138,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
extractHeaders(text, headerFooterPolicy);
}
for (IRunElement run : paragraph.getRuns()) {
if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
for (IRunElement run : paragraph.getIRuns()) {
if (run instanceof XWPFSDT) {
text.append(((XWPFSDT) run).getContent().getText());
} else if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
text.append(((XWPFRun)run).text());
} else {
text.append(run);

View File

@ -17,6 +17,16 @@
package org.apache.poi.xwpf.extractor;
import org.apache.poi.util.StringUtil;
import org.apache.poi.xwpf.XWPFTestDataSamples;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.apache.poi.POITestCase.assertContains;
import static org.apache.poi.POITestCase.assertEndsWith;
import static org.apache.poi.POITestCase.assertNotContained;
@ -25,16 +35,6 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.poi.util.StringUtil;
import org.apache.poi.xwpf.XWPFTestDataSamples;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
/**
* Tests for XWPFWordExtractor
*/
@ -460,4 +460,21 @@ public class TestXWPFWordExtractor {
assertContains(txt, "footer 1");
}
}
/**
 * Regression test for bug 55966: the extracted text must include the text of
 * content controls, both when a control sits inside a paragraph next to
 * ordinary text and when a control makes up the entire paragraph.
 *
 * @throws IOException if the sample document cannot be opened or closed
 */
@Test
public void bug55966() throws IOException {
    // Declare the extractor as a managed resource as well, so it is closed
    // even when the extraction itself throws.
    try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55966.docx");
         XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
        String expected = "Content control within a paragraph is here text content from within a paragraph second control with a new\n" +
                "line\n" +
                "\n" +
                "Content control that is the entire paragraph\n";

        assertEquals(expected, extractor.getText());
    }
}
}

Binary file not shown.