mirror of https://github.com/apache/poi.git
61470 -- add extraction of content within ruby elements; allow users to concatenate or not concatenate phonetic strings. Default is to concatenate.
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1806712 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d68912db25
commit
d5b3bd57af
|
@ -33,6 +33,7 @@ import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
|
import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFRelation;
|
import org.apache.poi.xwpf.usermodel.XWPFRelation;
|
||||||
|
import org.apache.poi.xwpf.usermodel.XWPFRun;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFSDT;
|
import org.apache.poi.xwpf.usermodel.XWPFSDT;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
|
import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
|
||||||
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
||||||
|
@ -53,6 +54,7 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||||
|
|
||||||
private XWPFDocument document;
|
private XWPFDocument document;
|
||||||
private boolean fetchHyperlinks = false;
|
private boolean fetchHyperlinks = false;
|
||||||
|
private boolean concatenatePhoneticRuns = true;
|
||||||
|
|
||||||
public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
|
public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
|
||||||
this(new XWPFDocument(container));
|
this(new XWPFDocument(container));
|
||||||
|
@ -86,6 +88,14 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||||
fetchHyperlinks = fetch;
|
fetchHyperlinks = fetch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should we concatenate phonetic runs in extraction. Default is <code>true</code>
|
||||||
|
* @param concatenatePhoneticRuns
|
||||||
|
*/
|
||||||
|
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
|
||||||
|
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
|
||||||
|
}
|
||||||
|
|
||||||
public String getText() {
|
public String getText() {
|
||||||
StringBuffer text = new StringBuffer();
|
StringBuffer text = new StringBuffer();
|
||||||
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
|
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
|
||||||
|
@ -130,7 +140,11 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||||
|
|
||||||
|
|
||||||
for (IRunElement run : paragraph.getRuns()) {
|
for (IRunElement run : paragraph.getRuns()) {
|
||||||
|
if (! concatenatePhoneticRuns && run instanceof XWPFRun) {
|
||||||
|
text.append(((XWPFRun)run).text());
|
||||||
|
} else {
|
||||||
text.append(run);
|
text.append(run);
|
||||||
|
}
|
||||||
if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
|
if (run instanceof XWPFHyperlinkRun && fetchHyperlinks) {
|
||||||
XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document);
|
XWPFHyperlink link = ((XWPFHyperlinkRun) run).getHyperlink(document);
|
||||||
if (link != null)
|
if (link != null)
|
||||||
|
|
|
@ -68,6 +68,8 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRuby;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRubyContent;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedHpsMeasure;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedTwipsMeasure;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSignedTwipsMeasure;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
||||||
|
@ -1042,11 +1044,16 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the string version of the text
|
* Returns the string version of the text and the phonetic string
|
||||||
*/
|
*/
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
String phonetic = getPhonetic();
|
||||||
|
if (phonetic.length() > 0) {
|
||||||
|
return text() +" ("+phonetic.toString()+")";
|
||||||
|
} else {
|
||||||
return text();
|
return text();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the string version of the text, with tabs and
|
* Returns the string version of the text, with tabs and
|
||||||
|
@ -1061,6 +1068,77 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
|
||||||
c.selectPath("./*");
|
c.selectPath("./*");
|
||||||
while (c.toNextSelection()) {
|
while (c.toNextSelection()) {
|
||||||
XmlObject o = c.getObject();
|
XmlObject o = c.getObject();
|
||||||
|
if (o instanceof CTRuby) {
|
||||||
|
handleRuby(o, text, false);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
_getText(o, text);
|
||||||
|
}
|
||||||
|
c.dispose();
|
||||||
|
return text.toString();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return the phonetic (ruby) string associated with this run or an empty String if none exists
|
||||||
|
*/
|
||||||
|
public String getPhonetic() {
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
|
// Grab the text and tabs of the text run
|
||||||
|
// Do so in a way that preserves the ordering
|
||||||
|
XmlCursor c = run.newCursor();
|
||||||
|
c.selectPath("./*");
|
||||||
|
while (c.toNextSelection()) {
|
||||||
|
XmlObject o = c.getObject();
|
||||||
|
if (o instanceof CTRuby) {
|
||||||
|
handleRuby(o, text, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
c.dispose();
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param rubyObj rubyobject
|
||||||
|
* @param text buffer to which to append the content
|
||||||
|
* @param extractPhonetic extract the phonetic (rt) component or the base component
|
||||||
|
*/
|
||||||
|
private void handleRuby(XmlObject rubyObj, StringBuffer text, boolean extractPhonetic) {
|
||||||
|
XmlCursor c = rubyObj.newCursor();
|
||||||
|
|
||||||
|
//according to the spec, a ruby object
|
||||||
|
//has the phonetic (rt) first, then the actual text (base)
|
||||||
|
//second.
|
||||||
|
|
||||||
|
c.selectPath(".//*");
|
||||||
|
boolean inRT = false;
|
||||||
|
boolean inBase = false;
|
||||||
|
while (c.toNextSelection()) {
|
||||||
|
XmlObject o = c.getObject();
|
||||||
|
if (o instanceof CTRubyContent) {
|
||||||
|
String tagName = o.getDomNode().getNodeName();
|
||||||
|
if ("w:rt".equals(tagName)) {
|
||||||
|
inRT = true;
|
||||||
|
} else if ("w:rubyBase".equals(tagName)) {
|
||||||
|
inRT = false;
|
||||||
|
inBase = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (extractPhonetic && inRT) {
|
||||||
|
_getText(o, text);
|
||||||
|
} else if (! extractPhonetic && inBase) {
|
||||||
|
_getText(o, text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
c.dispose();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void _getText(XmlObject o, StringBuffer text) {
|
||||||
|
|
||||||
if (o instanceof CTText) {
|
if (o instanceof CTText) {
|
||||||
String tagName = o.getDomNode().getNodeName();
|
String tagName = o.getDomNode().getNodeName();
|
||||||
// Field Codes (w:instrText, defined in spec sec. 17.16.23)
|
// Field Codes (w:instrText, defined in spec sec. 17.16.23)
|
||||||
|
@ -1116,16 +1194,13 @@ public class XWPFRun implements ISDTContents, IRunElement, CharacterRun {
|
||||||
"[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
|
"[footnoteRef:" + ftn.getId().intValue() + "]" : "[endnoteRef:" + ftn.getId().intValue() + "]";
|
||||||
text.append(footnoteRef);
|
text.append(footnoteRef);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
c.dispose();
|
|
||||||
|
|
||||||
// Any picture text?
|
// Any picture text?
|
||||||
if (pictureText != null && pictureText.length() > 0) {
|
if (pictureText != null && pictureText.length() > 0) {
|
||||||
text.append("\n").append(pictureText);
|
text.append("\n").append(pictureText);
|
||||||
}
|
}
|
||||||
|
|
||||||
return text.toString();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -421,4 +421,16 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||||
extractor.getText());
|
extractor.getText());
|
||||||
extractor.close();
|
extractor.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testPhonetic() throws IOException {
|
||||||
|
XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("61470.docx");
|
||||||
|
XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
|
||||||
|
//expect: baseText (phoneticText)
|
||||||
|
assertEquals("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", extractor.getText().trim());
|
||||||
|
extractor.close();
|
||||||
|
extractor = new XWPFWordExtractor(doc);
|
||||||
|
extractor.setConcatenatePhoneticRuns(false);
|
||||||
|
assertEquals("\u6771\u4EAC", extractor.getText().trim());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue