Bug 55966: Also include content control text in word extraction when the content control is part of a paragraph

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1875802 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dominik Stadler 2020-03-28 09:24:38 +00:00
parent 456dc4d368
commit da2afc19e2
3 changed files with 32 additions and 14 deletions

View File

@ -90,7 +90,7 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
/** /**
* Should we concatenate phonetic runs in extraction. Default is <code>true</code> * Should we concatenate phonetic runs in extraction. Default is <code>true</code>
* @param concatenatePhoneticRuns * @param concatenatePhoneticRuns If phonetic runs should be concatenated
*/ */
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) { public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
this.concatenatePhoneticRuns = concatenatePhoneticRuns; this.concatenatePhoneticRuns = concatenatePhoneticRuns;
@ -138,9 +138,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
extractHeaders(text, headerFooterPolicy); extractHeaders(text, headerFooterPolicy);
} }
for (IRunElement run : paragraph.getIRuns()) {
for (IRunElement run : paragraph.getRuns()) { if (run instanceof XWPFSDT) {
if (! concatenatePhoneticRuns && run instanceof XWPFRun) { text.append(((XWPFSDT) run).getContent().getText());
} else if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
text.append(((XWPFRun)run).text()); text.append(((XWPFRun)run).text());
} else { } else {
text.append(run); text.append(run);

View File

@ -17,6 +17,16 @@
package org.apache.poi.xwpf.extractor; package org.apache.poi.xwpf.extractor;
import org.apache.poi.util.StringUtil;
import org.apache.poi.xwpf.XWPFTestDataSamples;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.apache.poi.POITestCase.assertContains; import static org.apache.poi.POITestCase.assertContains;
import static org.apache.poi.POITestCase.assertEndsWith; import static org.apache.poi.POITestCase.assertEndsWith;
import static org.apache.poi.POITestCase.assertNotContained; import static org.apache.poi.POITestCase.assertNotContained;
@ -25,16 +35,6 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue; import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.poi.util.StringUtil;
import org.apache.poi.xwpf.XWPFTestDataSamples;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
/** /**
* Tests for XWPFWordExtractor * Tests for XWPFWordExtractor
*/ */
@ -460,4 +460,21 @@ public class TestXWPFWordExtractor {
assertContains(txt, "footer 1"); assertContains(txt, "footer 1");
} }
} }
/**
 * Regression test for bug 55966: extracts the text of the sample document
 * "55966.docx" and checks it against the expected output, which includes the
 * text of content controls both inside a paragraph and spanning a whole paragraph.
 *
 * @throws IOException if the sample document cannot be opened or closed
 */
@Test
public void bug55966() throws IOException {
    // Manage the extractor together with the document so that it is closed
    // even when getText() or the assertion throws.
    try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55966.docx");
         XWPFWordExtractor extractedDoc = new XWPFWordExtractor(doc)) {
        String expected = "Content control within a paragraph is here text content from within a paragraph second control with a new\n" +
                "line\n" +
                "\n" +
                "Content control that is the entire paragraph\n";

        String actual = extractedDoc.getText();

        assertEquals(expected, actual);
    }
}
} }

Binary file not shown.