Improved hyperlink and comment fetching for xwpf text extraction, based on the patch from bug #44821

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@651979 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-04-27 16:36:51 +00:00
parent 24ba833f03
commit 61405ba81f
11 changed files with 560 additions and 65 deletions

View File

@ -17,6 +17,9 @@
package org.apache.poi.xwpf;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Iterator;
import org.apache.poi.POIXMLDocument;
import org.apache.xmlbeans.XmlException;
@ -24,12 +27,22 @@ import org.openxml4j.exceptions.InvalidFormatException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxml4j.opc.PackageRelationship;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTComment;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CommentsDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFComment;
import org.apache.poi.xwpf.usermodel.XWPFTable;
/**
* Experimental class to do low level processing
@ -48,15 +61,59 @@ public class XWPFDocument extends POIXMLDocument {
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
public static final String COMMENT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments";
private DocumentDocument wordDoc;
protected List<XWPFComment> comments;
protected List<XWPFHyperlink> hyperlinks;
protected List<XWPFParagraph> paragraphs;
protected List<XWPFTable> tables;
/**
 * Opens the core part of the given package as a word processing document
 * and builds the paragraph, hyperlink, comment and table lists from it.
 *
 * @param container the package holding the document
 * @throws OpenXML4JException if the hyperlink relationships cannot be read
 * @throws IOException if a package part cannot be read
 * @throws XmlException if the document or comments XML cannot be parsed
 */
public XWPFDocument(Package container) throws OpenXML4JException, IOException, XmlException {
    super(container);

    hyperlinks = new LinkedList<XWPFHyperlink>();
    comments = new LinkedList<XWPFComment>();
    paragraphs = new LinkedList<XWPFParagraph>();
    tables = new LinkedList<XWPFTable>();

    wordDoc = DocumentDocument.Factory.parse(getCorePart().getInputStream());

    // filling paragraph list
    for (CTP p : getDocumentBody().getPArray()) {
        paragraphs.add(new XWPFParagraph(p, this));
    }

    // Get the hyperlinks
    // TODO: make me optional/separated in private function
    try {
        Iterator<PackageRelationship> relIter =
            getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE).iterator();
        while (relIter.hasNext()) {
            PackageRelationship rel = relIter.next();
            hyperlinks.add(new XWPFHyperlink(rel.getId(), rel.getTargetURI().toString()));
        }
    } catch (Exception e) {
        // Keep the declared exception type, but attach the original failure
        // so the stack trace of the real problem is not lost.
        OpenXML4JException wrapped = new OpenXML4JException(e.getLocalizedMessage());
        wrapped.initCause(e);
        throw wrapped;
    }

    // Get the comments, if there are any
    PackageRelationshipCollection commentsRel = getCmntRelations();
    if (commentsRel != null && commentsRel.size() > 0) {
        PackagePart commentsPart = getTargetPart(commentsRel.getRelationship(0));
        CommentsDocument cmntdoc = CommentsDocument.Factory.parse(commentsPart.getInputStream());
        for (CTComment ctcomment : cmntdoc.getComments().getCommentArray()) {
            comments.add(new XWPFComment(ctcomment));
        }
    }

    // Tables come from the document body, not from the comments part, so
    // they must be collected whether or not the document has comments.
    for (CTTbl table : getDocumentBody().getTblArray()) {
        tables.add(new XWPFTable(table));
    }
}
/**
@ -66,6 +123,42 @@ public class XWPFDocument extends POIXMLDocument {
return wordDoc.getDocument();
}
/**
 * Gives access to the paragraphs held in this document's paragraph list.
 *
 * @return an iterator over the paragraph list
 */
public Iterator<XWPFParagraph> getParagraphsIterator() {
    final Iterator<XWPFParagraph> paragraphIter = paragraphs.iterator();
    return paragraphIter;
}
/**
 * Gives access to the tables held in this document's table list.
 *
 * @return an iterator over the table list
 */
public Iterator<XWPFTable> getTablesIterator() {
    final Iterator<XWPFTable> tableIter = tables.iterator();
    return tableIter;
}
/**
 * Searches the hyperlink list for an entry with the given id.
 *
 * @param id the hyperlink id to look for
 * @return the first hyperlink whose id equals <code>id</code>,
 *         or <code>null</code> when none matches
 */
public XWPFHyperlink getHyperlinkByID(String id) {
    for (XWPFHyperlink candidate : hyperlinks) {
        if (candidate.getId().equals(id)) {
            return candidate;
        }
    }
    return null;
}
/**
 * Searches the comment list for an entry with the given id.
 *
 * @param id the comment id, as text, to look for
 * @return the first comment whose id equals <code>id</code>,
 *         or <code>null</code> when none matches
 */
public XWPFComment getCommentByID(String id) {
    for (XWPFComment candidate : comments) {
        if (candidate.getId().equals(id)) {
            return candidate;
        }
    }
    return null;
}
/**
* Returns the low level body of the document
*/
@ -91,18 +184,10 @@ public class XWPFDocument extends POIXMLDocument {
StylesDocument.Factory.parse(parts[0].getInputStream());
return sd.getStyles();
}
/**
* Returns all the hyperlink relations for the file.
* You'll generally want to get the target to get
* the destination of the hyperlink
*/
public PackageRelationshipCollection getHyperlinks() {
try {
return getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE);
} catch(InvalidFormatException e) {
// Should never happen
throw new IllegalStateException(e);
}
/**
 * Fetches the relationships of the core part whose type is
 * {@link #COMMENT_RELATION_TYPE}.
 *
 * @return the matching relationships of the core part
 * @throws InvalidFormatException propagated from the relationship lookup
 */
protected PackageRelationshipCollection getCmntRelations() throws InvalidFormatException {
    PackageRelationshipCollection commentRelations =
        getCorePart().getRelationshipsByType(COMMENT_RELATION_TYPE);
    return commentRelations;
}
}

View File

@ -17,19 +17,19 @@
package org.apache.poi.xwpf.extractor;
import java.io.IOException;
import java.util.Iterator;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.xwpf.XWPFDocument;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageRelationship;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
/**
* Helper class to extract text from an OOXML Word file
@ -45,6 +45,15 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
super(document);
this.document = document;
}
/**
 * Chooses whether hyperlinks are fetched as well when the text
 * content is fetched. The default is to output only the hyperlink
 * label, and not the contents.
 *
 * @param fetch <code>true</code> to also fetch the hyperlinks
 */
public void setFetchHyperlinks(boolean fetch) {
    this.fetchHyperlinks = fetch;
}
public static void main(String[] args) throws Exception {
if(args.length < 1) {
@ -59,56 +68,21 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
System.out.println(extractor.getText());
}
/**
 * Sets whether the hyperlinks should also be fetched when
 * fetching the text content. The default is to output only
 * the hyperlink label, and not the contents.
 *
 * @param fetch the new value of the fetch-hyperlinks flag
 */
public void setFetchHyperlinks(boolean fetch) {
fetchHyperlinks = fetch;
}
public String getText() {
CTBody body = document.getDocumentBody();
StringBuffer text = new StringBuffer();
// Loop over paragraphs
CTP[] ps = body.getPArray();
for (int i = 0; i < ps.length; i++) {
// Loop over ranges and hyperlinks
// TODO - properly intersperse ranges and hyperlinks
CTR[] rs = ps[i].getRArray();
for(int j = 0; j < rs.length; j++) {
// Loop over text runs
CTText[] texts = rs[j].getTArray();
for (int k = 0; k < texts.length; k++) {
text.append(
texts[k].getStringValue()
);
}
}
CTHyperlink[] hls = ps[i].getHyperlinkArray();
for(CTHyperlink hl : hls) {
for(CTR r : hl.getRArray()) {
for(CTText txt : r.getTArray()) {
text.append(txt.getStringValue());
}
}
if(fetchHyperlinks) {
String id = hl.getId();
if(id != null) {
PackageRelationship hlRel =
document.getHyperlinks().getRelationshipByID(id);
if(hlRel != null) {
text.append(" <" + hlRel.getTargetURI().toString() + ">");
}
}
}
}
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
while(i.hasNext()) {
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
new XWPFHyperlinkDecorator(i.next(), null, fetchHyperlinks));
text.append(decorator.getText()+"\n");
}
// New line after each paragraph.
text.append("\n");
Iterator<XWPFTable> j = document.getTablesIterator();
while(j.hasNext())
{
text.append(j.next().getText()+"\n");
}
return text.toString();

View File

@ -0,0 +1,37 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.model;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
/**
 * Base class for XWPF paragraphs. It simply holds on to the
 * low level CTP bean that it was created from.
 *
 * @author Yury Batrakov (batrakov at gmail.com)
 */
public class XMLParagraph {
    /** The low level paragraph bean handed to the constructor. */
    protected CTP paragraph;

    /**
     * @param paragraph the low level paragraph bean to hold
     */
    public XMLParagraph(CTP paragraph) {
        super();
        this.paragraph = paragraph;
    }

    /**
     * @return the low level paragraph bean given at construction
     */
    public CTP getCTP() {
        return this.paragraph;
    }
}

View File

@ -0,0 +1,52 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.model;
import org.apache.poi.xwpf.usermodel.XWPFComment;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTMarkupRange;
/**
 * Decorator class for XWPFParagraph that appends the comments
 * anchored in the paragraph to the paragraph's text.
 *
 * @author Yury Batrakov (batrakov at gmail.com)
 *
 */
public class XWPFCommentsDecorator extends XWPFParagraphDecorator {
    /** Text of all comments found for the paragraph, built once in the constructor. */
    private StringBuffer commentText;

    /**
     * Wraps another decorator, working on that decorator's paragraph.
     *
     * @param nextDecorator the decorator whose text is output before the comments
     */
    public XWPFCommentsDecorator(XWPFParagraphDecorator nextDecorator) {
        this(nextDecorator.paragraph, nextDecorator);
    }

    /**
     * @param paragraph the paragraph whose comment anchors are inspected
     * @param nextDecorator the next decorator to use, may be <code>null</code>
     */
    public XWPFCommentsDecorator(XWPFParagraph paragraph, XWPFParagraphDecorator nextDecorator) {
        super(paragraph, nextDecorator);
        commentText = new StringBuffer();

        // A paragraph created without a document reference has nowhere to
        // look its comments up, so leave the comment text empty rather than
        // failing with a NullPointerException.
        if (paragraph.getDocRef() == null) {
            return;
        }

        for (CTMarkupRange anchor : paragraph.getCTP().getCommentRangeStartArray()) {
            XWPFComment comment = paragraph.getDocRef().getCommentByID(anchor.getId().toString());
            if (comment != null) {
                commentText.append("\tComment by ")
                           .append(comment.getAuthor())
                           .append(": ")
                           .append(comment.getText());
            }
        }
    }

    /**
     * @return the text of the wrapped decorator or paragraph, followed by the comment text
     */
    public String getText() {
        return super.getText() + commentText;
    }
}

View File

@ -0,0 +1,69 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.model;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;;
/**
* Decorator class for XWPFParagraph allowing to add hyperlinks
* found in paragraph to its text.
*
* TODO - add the hyperlink text in the right place, and not just
* at the end
*/
public class XWPFHyperlinkDecorator extends XWPFParagraphDecorator {
private StringBuffer hyperlinkText;
/**
* @param nextDecorator The next decorator to use
* @param outputHyperlinkUrls Should we output the links too, or just the link text?
*/
public XWPFHyperlinkDecorator(XWPFParagraphDecorator nextDecorator, boolean outputHyperlinkUrls) {
this(nextDecorator.paragraph, nextDecorator, outputHyperlinkUrls);
}
/**
* @param prgrph The paragraph of text to work on
* @param outputHyperlinkUrls Should we output the links too, or just the link text?
*/
public XWPFHyperlinkDecorator(XWPFParagraph prgrph, XWPFParagraphDecorator nextDecorator, boolean outputHyperlinkUrls) {
super(prgrph, nextDecorator);
hyperlinkText = new StringBuffer();
// loop over hyperlink anchors
for(CTHyperlink link : paragraph.getCTP().getHyperlinkArray()){
for (CTR r : link.getRArray()) {
// Loop over text runs
for (CTText text : r.getTArray()){
hyperlinkText.append(text.getStringValue());
}
}
if(outputHyperlinkUrls && paragraph.getDocRef().getHyperlinkByID(link.getId()) != null) {
hyperlinkText.append(" <"+paragraph.getDocRef().getHyperlinkByID(link.getId()).getURL()+">");
}
}
}
public String getText()
{
return super.getText() + hyperlinkText;
}
}

View File

@ -0,0 +1,43 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.model;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
 * Base decorator class for XWPFParagraph. Decorators can be chained;
 * the text comes from the next decorator when there is one, and from
 * the paragraph itself otherwise.
 */
public abstract class XWPFParagraphDecorator {
    /** The paragraph being decorated. */
    protected XWPFParagraph paragraph;
    /** The next decorator in the chain, or <code>null</code> when there is none. */
    protected XWPFParagraphDecorator nextDecorator;

    /**
     * Creates a decorator with no next decorator.
     *
     * @param paragraph the paragraph to work on
     */
    public XWPFParagraphDecorator(XWPFParagraph paragraph) {
        this(paragraph, null);
    }

    /**
     * @param paragraph the paragraph to work on
     * @param nextDecorator the next decorator to use, may be <code>null</code>
     */
    public XWPFParagraphDecorator(XWPFParagraph paragraph, XWPFParagraphDecorator nextDecorator) {
        this.nextDecorator = nextDecorator;
        this.paragraph = paragraph;
    }

    /**
     * @return the next decorator's text if one is set, otherwise the paragraph's own text
     */
    public String getText() {
        return (nextDecorator == null) ? paragraph.getText() : nextDecorator.getText();
    }
}

View File

@ -0,0 +1,61 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTComment;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
/**
 * Sketch of XWPF comment class. Holds the id, the author and the
 * concatenated paragraph text of one comment.
 *
 * @author Yury Batrakov (batrakov at gmail.com)
 *
 */
public class XWPFComment {
    protected String id;
    protected String author;
    protected StringBuffer text;

    /**
     * Reads id, author and paragraph text out of the given comment bean.
     *
     * @param comment the low level comment bean
     */
    public XWPFComment(CTComment comment) {
        this.text = new StringBuffer();
        this.id = comment.getId().toString();
        this.author = comment.getAuthor();

        // Paragraph texts are joined back to back, with no separator
        CTP[] commentParagraphs = comment.getPArray();
        for (int i = 0; i < commentParagraphs.length; i++) {
            this.text.append(new XWPFParagraph(commentParagraphs[i]).getText());
        }
    }

    /**
     * @return the comment id, as text
     */
    public String getId() {
        return this.id;
    }

    /**
     * @return the comment author as read from the comment bean
     */
    public String getAuthor() {
        return this.author;
    }

    /**
     * @return the text of all the comment's paragraphs
     */
    public String getText() {
        return this.text.toString();
    }
}

View File

@ -0,0 +1,43 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
/**
 * Sketch of XWPF hyperlink class. A plain pairing of a
 * hyperlink id with the URL it points at.
 *
 * @author Yury Batrakov (batrakov at gmail.com)
 *
 */
public class XWPFHyperlink {
    String id;
    String url;

    /**
     * @param id the id of the hyperlink
     * @param url the URL of the hyperlink
     */
    public XWPFHyperlink(String id, String url) {
        this.url = url;
        this.id = id;
    }

    /**
     * @return the id given at construction
     */
    public String getId() {
        return this.id;
    }

    /**
     * @return the URL given at construction
     */
    public String getURL() {
        return this.url;
    }
}

View File

@ -0,0 +1,70 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import org.apache.poi.xwpf.model.XMLParagraph;
import org.apache.poi.xwpf.XWPFDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
/**
 * Sketch of XWPF paragraph class. The text of the paragraph's runs
 * is collected once, at construction time.
 */
public class XWPFParagraph extends XMLParagraph {
    // XXX: we'd like to have access to document's hyperlink, comments and other tables
    protected XWPFDocument docRef;

    /**
     * TODO - replace with RichText String
     */
    private StringBuffer text = new StringBuffer();

    /**
     * @param prgrph the low level paragraph bean
     * @param docRef the document the paragraph belongs to, may be <code>null</code>
     */
    public XWPFParagraph(CTP prgrph, XWPFDocument docRef) {
        super(prgrph);
        this.docRef = docRef;

        // Collect the text of every text element of every run
        for (CTR run : paragraph.getRArray()) {
            for (CTText runText : run.getTArray()) {
                text.append(runText.getStringValue());
            }
        }
    }

    /**
     * Creates a paragraph that has no document reference.
     *
     * @param prgrph the low level paragraph bean
     */
    public XWPFParagraph(CTP prgrph) {
        this(prgrph, null);
    }

    /**
     * Creates a paragraph, without a document reference, from the
     * bean held by another paragraph object.
     *
     * @param paragraph the paragraph object to take the bean from
     */
    public XWPFParagraph(XMLParagraph paragraph) {
        this(paragraph.getCTP());
    }

    /**
     * @return the document given at construction, or <code>null</code> if none was given
     */
    public XWPFDocument getDocRef() {
        return this.docRef;
    }

    /**
     * @return the text gathered from the paragraph's runs
     */
    public String getText() {
        return this.text.toString();
    }
}

View File

@ -0,0 +1,6 @@
package org.apache.poi.xwpf.usermodel;
/**
 * Empty class: it declares no fields or methods of its own.
 * NOTE(review): presumably a stub for future paragraph text handling - confirm.
 */
public class XWPFParagraphText
{
}

View File

@ -0,0 +1,55 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xwpf.usermodel;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
/**
 * Sketch of XWPFTable class. Only the table's text is held.
 *
 * @author Yury Batrakov (batrakov at gmail.com)
 *
 */
public class XWPFTable {
    protected StringBuffer text = new StringBuffer();

    /**
     * Gathers the text of the table: each paragraph of each cell is
     * followed by a tab, and each row is followed by a newline.
     *
     * @param table the low level table bean
     */
    public XWPFTable(CTTbl table) {
        CTRow[] rows = table.getTrArray();
        for (int r = 0; r < rows.length; r++) {
            CTTc[] cells = rows[r].getTcArray();
            for (int c = 0; c < cells.length; c++) {
                CTP[] cellParagraphs = cells[c].getPArray();
                for (int p = 0; p < cellParagraphs.length; p++) {
                    String paragraphText = new XWPFParagraph(cellParagraphs[p]).getText();
                    text.append(paragraphText).append("\t");
                }
            }
            text.append("\n");
        }
    }

    /**
     * @return the text gathered from the table at construction
     */
    public String getText() {
        return this.text.toString();
    }
}