diff --git a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
index cacac24323..4b61c09d19 100644
--- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
@@ -90,7 +90,7 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
 
     /**
      * Should we concatenate phonetic runs in extraction. Default is true
-     * @param concatenatePhoneticRuns
+     * @param concatenatePhoneticRuns If phonetic runs should be concatenated
      */
     public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
         this.concatenatePhoneticRuns = concatenatePhoneticRuns;
@@ -138,9 +138,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
             extractHeaders(text, headerFooterPolicy);
         }
 
-
-        for (IRunElement run : paragraph.getRuns()) {
-            if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
+        for (IRunElement run : paragraph.getIRuns()) {
+            if (run instanceof XWPFSDT) {
+                text.append(((XWPFSDT) run).getContent().getText());
+            } else if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
                 text.append(((XWPFRun)run).text());
             } else {
                 text.append(run);
diff --git a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
index ecab432c10..8a14b25ba0 100644
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@@ -17,6 +17,16 @@
 
 package org.apache.poi.xwpf.extractor;
 
+import org.apache.poi.util.StringUtil;
+import org.apache.poi.xwpf.XWPFTestDataSamples;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 import static org.apache.poi.POITestCase.assertContains;
 import static org.apache.poi.POITestCase.assertEndsWith;
 import static org.apache.poi.POITestCase.assertNotContained;
@@ -25,16 +35,6 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-import java.io.IOException;
-import java.util.Locale;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.poi.util.StringUtil;
-import org.apache.poi.xwpf.XWPFTestDataSamples;
-import org.apache.poi.xwpf.usermodel.XWPFDocument;
-import org.junit.Test;
-
 /**
  * Tests for HXFWordExtractor
  */
@@ -460,4 +460,23 @@ public class TestXWPFWordExtractor {
             assertContains(txt, "footer 1");
         }
     }
+
+    /**
+     * Bug 55966: text held in content controls (XWPFSDT) inside a paragraph,
+     * or forming an entire paragraph, must appear in the extracted text.
+     */
+    @Test
+    public void bug55966() throws IOException {
+        String expected = "Content control within a paragraph is here text content from within a paragraph second control with a new\n"
+                + "line\n"
+                + "\n"
+                + "Content control that is the entire paragraph\n";
+
+        // try-with-resources releases the extractor and then the document,
+        // even if getText() or the assertion throws.
+        try (XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("55966.docx");
+             XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
+            assertEquals(expected, extractor.getText());
+        }
+    }
 }
diff --git a/test-data/document/55966.docx b/test-data/document/55966.docx
new file mode 100644
index 0000000000..28543c9702
Binary files /dev/null and b/test-data/document/55966.docx differ