61470 -- add extraction of content within ruby elements; allow users to concatenate or not concatenate phonetic strings. Default is to concatenate.

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1806712 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-08-30 16:29:52 +00:00
parent d68912db25
commit d5b3bd57af
4 changed files with 153 additions and 52 deletions

View File

@ -33,6 +33,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRelation; import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFSDT; import org.apache.poi.xwpf.usermodel.XWPFSDT;
import org.apache.poi.xwpf.usermodel.XWPFSDTCell; import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTable;
@ -53,6 +54,7 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
private XWPFDocument document; private XWPFDocument document;
private boolean fetchHyperlinks = false; private boolean fetchHyperlinks = false;
private boolean concatenatePhoneticRuns = true;
public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
this(new XWPFDocument(container)); this(new XWPFDocument(container));
@ -86,6 +88,14 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
fetchHyperlinks = fetch; fetchHyperlinks = fetch;
} }
/**
 * Controls whether phonetic (ruby) text is included in the extracted text.
 * Default is <code>true</code>.
 * <p>
 * When <code>true</code>, {@link #getText()} appends each run using its string
 * form, which for an {@link XWPFRun} with phonetic text is the base text
 * followed by the phonetic text in parentheses. When <code>false</code>,
 * only the plain text of each {@link XWPFRun} is appended.
 *
 * @param concatenatePhoneticRuns <code>true</code> to include phonetic text,
 *        <code>false</code> to extract the base text only
 */
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
}
public String getText() { public String getText() {
StringBuffer text = new StringBuffer(); StringBuffer text = new StringBuffer();
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
@ -130,7 +140,11 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
for (IRunElement run : paragraph.getRuns()) { for (IRunElement run : paragraph.getRuns()) {
if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
text.append(((XWPFRun)run).text());
} else {
text.append(run); text.append(run);
}
if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) { if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document); XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document);
if (link != null) if (link != null)

View File

@ -68,6 +68,8 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRuby;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRubyContent;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedTwipsMeasure; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedTwipsMeasure;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
@ -1042,11 +1044,16 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
} }
/** /**
* Returns the string version of the text * Returns the string version of the text and the phonetic string
*/ */
public String toString() { public String toString() {
String phonetic = getPhonetic();
if (phonetic.length() > 0) {
return text() +" ("+phonetic.toString()+")";
} else {
return text(); return text();
} }
}
/** /**
* Returns the string version of the text, with tabs and * Returns the string version of the text, with tabs and
@ -1061,6 +1068,77 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
c.selectPath("./*"); c.selectPath("./*");
while (c.toNextSelection()) { while (c.toNextSelection()) {
XmlObject o = c.getObject(); XmlObject o = c.getObject();
if (o instanceof CTRuby) {
handleRuby(o, text, false);
continue;
}
_getText(o, text);
}
c.dispose();
return text.toString();
}
/**
 * Collects the phonetic (ruby) text attached to this run.
 *
 * @return the phonetic (ruby) string associated with this run or an empty String if none exists
 */
public String getPhonetic() {
    StringBuffer text = new StringBuffer();
    // Visit the child elements of the run and, for every ruby element,
    // append only its phonetic (rt) component
    XmlCursor c = run.newCursor();
    try {
        c.selectPath("./*");
        while (c.toNextSelection()) {
            XmlObject o = c.getObject();
            if (o instanceof CTRuby) {
                handleRuby(o, text, true);
            }
        }
    } finally {
        // always release the cursor, even if extraction fails part-way
        c.dispose();
    }
    return text.toString();
}
/**
 * Appends either the phonetic or the base text of a ruby element to a buffer.
 *
 * @param rubyObj rubyobject
 * @param text buffer to which to append the content
 * @param extractPhonetic extract the phonetic (rt) component or the base component
 */
private void handleRuby(XmlObject rubyObj, StringBuffer text, boolean extractPhonetic) {
    XmlCursor c = rubyObj.newCursor();
    try {
        //according to the spec, a ruby object
        //has the phonetic (rt) first, then the actual text (base)
        //second.
        c.selectPath(".//*");
        boolean inRT = false;
        boolean inBase = false;
        while (c.toNextSelection()) {
            XmlObject o = c.getObject();
            if (o instanceof CTRubyContent) {
                // a ruby content element marks the start of a new section;
                // track which one we are in, keeping the two flags exclusive
                String tagName = o.getDomNode().getNodeName();
                if ("w:rt".equals(tagName)) {
                    inRT = true;
                    // don't treat phonetic text as base text if an rt
                    // happens to follow a rubyBase
                    inBase = false;
                } else if ("w:rubyBase".equals(tagName)) {
                    inRT = false;
                    inBase = true;
                }
            } else {
                // any other element: extract it only if it sits inside
                // the section the caller asked for
                boolean wanted = extractPhonetic ? inRT : inBase;
                if (wanted) {
                    _getText(o, text);
                }
            }
        }
    } finally {
        // always release the cursor, even if extraction fails part-way
        c.dispose();
    }
}
private void _getText(XmlObject o, StringBuffer text) {
if (o instanceof CTText) { if (o instanceof CTText) {
String tagName = o.getDomNode().getNodeName(); String tagName = o.getDomNode().getNodeName();
// Field Codes (w:instrText, defined in spec sec. 17.16.23) // Field Codes (w:instrText, defined in spec sec. 17.16.23)
@ -1116,16 +1194,13 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
"[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]"; "[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
text.append(footnoteRef); text.append(footnoteRef);
} }
}
c.dispose();
// Any picture text? // Any picture text?
if (pictureText != null && pictureText.length() > 0) { if (pictureText != null && pictureText.length() > 0) {
text.append("\n").append(pictureText); text.append("\n").append(pictureText);
} }
return text.toString();
} }
/** /**

View File

@ -421,4 +421,16 @@ public class TestXWPFWordExtractor extends TestCase {
extractor.getText()); extractor.getText());
extractor.close(); extractor.close();
} }
/**
 * Checks extraction of a ruby (phonetic) run from 61470.docx, both with the
 * default concatenation of phonetic text and with concatenation disabled.
 */
public void testPhonetic() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("61470.docx");

    // default setting, expect: baseText (phoneticText)
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    try {
        assertEquals("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", extractor.getText().trim());
    } finally {
        extractor.close();
    }

    // phonetic concatenation disabled, expect: baseText only
    extractor = new XWPFWordExtractor(doc);
    try {
        extractor.setConcatenatePhoneticRuns(false);
        assertEquals("\u6771\u4EAC", extractor.getText().trim());
    } finally {
        // the original left this second extractor open
        extractor.close();
    }
}
} }

Binary file not shown.