61470 -- add extraction of content within ruby elements; allow users to concatenate or not concatenate phonetic strings. Default is to concatenate.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1806712 13f79535-47bb-0310-9956-ffa450edef68
2017-08-30 16:29:52 +00:00 · 2017-08-30 16:29:52 +00:00 · d5b3bd57af
parent d68912db25
commit d5b3bd57af
4 changed files with 153 additions and 52 deletions
--- a/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
@ -33,6 +33,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.poi.xwpf.usermodel.XWPFRun;
 import org.apache.poi.xwpf.usermodel.XWPFSDT;
 import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
 import org.apache.poi.xwpf.usermodel.XWPFTable;
@ -53,6 +54,7 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
    private XWPFDocument document;
    private boolean fetchHyperlinks = false;
    private boolean concatenatePhoneticRuns = true;
    public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
        this(new XWPFDocument(container));
@ -86,6 +88,14 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
        fetchHyperlinks = fetch;
    }
    /**
     * Controls whether phonetic (ruby) text is included alongside the base text
     * during extraction. When <code>true</code>, each run is appended via its
     * string form; when <code>false</code>, only the value of
     * {@link XWPFRun#text()} is appended for runs. Default is <code>true</code>.
     *
     * @param concatenatePhoneticRuns <code>true</code> to include phonetic text in the
     *                                extracted output, <code>false</code> to leave it out
     */
    public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
        this.concatenatePhoneticRuns = concatenatePhoneticRuns;
    }
    public String getText() {
        StringBuffer text = new StringBuffer();
        XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
@ -130,7 +140,11 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
        for (IRunElement run : paragraph.getRuns()) {
            if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
                text.append(((XWPFRun)run).text());
            } else {
                text.append(run);
            }
            if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
                XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document);
                if (link != null)
--- a/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
+++ b/src/ooxml/java/org/apache/poi/xwpf/usermodel/XWPFRun.java
@ -68,6 +68,8 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRuby;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRubyContent;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedTwipsMeasure;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
@ -1042,11 +1044,16 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
    }
    /**
-     * Returns the string version of the text
+     * Returns the string version of the text and the phonetic string
     */
    public String toString() {
        // Look up the phonetic (ruby) annotation first, then the base text.
        final String ruby = getPhonetic();
        final String baseText = text();

        // No annotation: the string form is just the base text.
        if (ruby.isEmpty()) {
            return baseText;
        }

        // Otherwise render as "base (phonetic)".
        return baseText + " (" + ruby + ")";
    }
    /**
     * Returns the string version of the text, with tabs and
@ -1061,6 +1068,77 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
        c.selectPath("./*");
        while (c.toNextSelection()) {
            XmlObject o = c.getObject();
            if (o instanceof CTRuby) {
                handleRuby(o, text, false);
                continue;
            }
            _getText(o, text);
        }
        c.dispose();
        return text.toString();
    }
    /**
     * Collects the phonetic (ruby) annotation text attached to this run.
     * <p>
     * Only the direct children of the run are inspected; for every ruby element
     * found, its phonetic guide text is appended in document order.
     *
     * @return the phonetic (ruby) string associated with this run or an empty String if none exists
     */
    public String getPhonetic() {
        StringBuffer text = new StringBuffer();
        XmlCursor c = run.newCursor();
        try {
            // Walk the direct children of the run in document order,
            // picking out only the ruby elements
            c.selectPath("./*");
            while (c.toNextSelection()) {
                XmlObject o = c.getObject();
                if (o instanceof CTRuby) {
                    handleRuby(o, text, true);
                }
            }
        } finally {
            // Release the cursor even if extraction fails part-way through
            c.dispose();
        }
        return text.toString();
    }
    /**
     * Appends either the phonetic guide text or the base text of a ruby element
     * to the supplied buffer.
     * <p>
     * All descendants of the ruby element are visited in document order. Entering an
     * <code>rt</code> or <code>rubyBase</code> container switches which section is
     * current; every other element is passed on for text extraction when it lies in
     * the requested section.
     *
     * @param rubyObj the ruby element whose content is to be extracted
     * @param text buffer to which to append the content
     * @param extractPhonetic <code>true</code> to extract the phonetic (rt) component,
     *                        <code>false</code> to extract the base (rubyBase) component
     */
    private void handleRuby(XmlObject rubyObj, StringBuffer text, boolean extractPhonetic) {
        XmlCursor c = rubyObj.newCursor();
        try {
            //according to the spec, a ruby object
            //has the phonetic (rt) first, then the actual text (base)
            //second.
            c.selectPath(".//*");
            boolean inRT = false;
            boolean inBase = false;
            while (c.toNextSelection()) {
                XmlObject o = c.getObject();
                if (o instanceof CTRubyContent) {
                    // Match on the local name so that the result does not depend on
                    // which prefix the document happens to bind to the namespace
                    String localName = o.getDomNode().getLocalName();
                    if (localName == null) {
                        // Node is not namespace-aware: strip any prefix from the qualified name
                        String qName = o.getDomNode().getNodeName();
                        localName = qName.substring(qName.indexOf(':') + 1);
                    }
                    if ("rt".equals(localName)) {
                        inRT = true;
                        // leaving any base section, so rt text is never reported as base text
                        inBase = false;
                    } else if ("rubyBase".equals(localName)) {
                        inRT = false;
                        inBase = true;
                    }
                } else {
                    // Only content of the requested section contributes to the output
                    boolean inRequestedSection = extractPhonetic ? inRT : inBase;
                    if (inRequestedSection) {
                        _getText(o, text);
                    }
                }
            }
        } finally {
            // Release the cursor even if extraction fails part-way through
            c.dispose();
        }
    }
    private void _getText(XmlObject o, StringBuffer text) {
        if (o instanceof CTText) {
            String tagName = o.getDomNode().getNodeName();
            // Field Codes (w:instrText, defined in spec sec. 17.16.23)
@ -1116,16 +1194,13 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
                    "[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
            text.append(footnoteRef);
        }
        }
        c.dispose();
        // Any picture text?
        if (pictureText != null && pictureText.length() > 0) {
            text.append("\n").append(pictureText);
        }
        return text.toString();
    }
    /**
--- a/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
+++ b/src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
@ -421,4 +421,16 @@ public class TestXWPFWordExtractor extends TestCase {
                extractor.getText());
        extractor.close();
    }
    /**
     * Verifies ruby (phonetic) handling for bug 61470: by default the phonetic text
     * is appended in parentheses after the base text, and it is left out when
     * phonetic concatenation is switched off.
     */
    public void testPhonetic() throws IOException {
        XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("61470.docx");

        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
        try {
            //expect: baseText (phoneticText)
            assertEquals("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", extractor.getText().trim());
        } finally {
            // close even when the assertion fails
            extractor.close();
        }

        extractor = new XWPFWordExtractor(doc);
        try {
            extractor.setConcatenatePhoneticRuns(false);
            //expect: baseText only
            assertEquals("\u6771\u4EAC", extractor.getText().trim());
        } finally {
            // the second extractor must be released as well
            extractor.close();
        }
    }
 }
--- a/test-data/document/61470.docx
+++ b/test-data/document/61470.docx