Tweak how we do ooxml properties, and handle hyperlinks for word documents when extracting

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646298 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-04-09 12:22:23 +00:00
parent 0650fb522f
commit b2e48a2767
9 changed files with 268 additions and 49 deletions

View File

@ -32,9 +32,6 @@ import org.openxml4j.opc.PackageRelationship;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxml4j.opc.PackageRelationshipTypes;
import org.openxml4j.opc.PackagingURIHelper;
import org.openxml4j.opc.internal.PackagePropertiesPart;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
public abstract class POIXMLDocument {
@ -48,6 +45,12 @@ public abstract class POIXMLDocument {
/** The OPC core Package Part */
private PackagePart corePart;
/**
* The properties of the OPC package, opened as needed
*/
private POIXMLProperties properties;
protected POIXMLDocument() {}
protected POIXMLDocument(Package pkg) throws IOException {
@ -178,28 +181,13 @@ public abstract class POIXMLDocument {
}
/**
* Get the core document properties (core ooxml properties).
* TODO: Replace with nice usermodel wrapper
* @deprecated To be replaced with a proper user-model style view of the properties
* Get the document properties. This gives you access to the
* core ooxml properties, and the extended ooxml properties.
*/
public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException {
PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE);
if(propsPart == null) {
return null;
public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException {
if(properties == null) {
properties = new POIXMLProperties(pkg);
}
return (PackagePropertiesPart)propsPart;
}
/**
* Get the extended document properties (extended ooxml properties)
* TODO: Replace with nice usermodel wrapper
* @deprecated To be replaced with a proper user-model style view of the properties
*/
public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException {
PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE);
PropertiesDocument props = PropertiesDocument.Factory.parse(
propsPart.getInputStream());
return props.getProperties();
return properties;
}
}

View File

@ -0,0 +1,124 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi;
import java.io.IOException;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxml4j.opc.internal.PackagePropertiesPart;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
/**
 * Wrapper around the two different kinds of OOXML properties
 *  a document can have: the core properties and the extended
 *  properties.
 * Both are looked up from the package's relationships when the
 *  wrapper is constructed.
 */
public class POIXMLProperties {
    private final Package pkg;
    private final CoreProperties core;
    private final ExtendedProperties ext;

    /**
     * Loads the core and extended properties of the given package.
     *
     * @param docPackage the OPC package to read the properties from
     * @throws IllegalArgumentException if the package does not have
     *  exactly one core properties relationship
     * @throws IOException if the extended properties part can't be read
     * @throws OpenXML4JException if the package relationships can't be resolved
     * @throws XmlException if the extended properties part isn't valid xml
     */
    public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException {
        this.pkg = docPackage;

        // Core properties
        PackageRelationshipCollection coreRel =
            pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE);
        if(coreRel.size() == 1) {
            core = new CoreProperties( (PackagePropertiesPart)
                    pkg.getPart(coreRel.getRelationship(0)) );
        } else {
            throw new IllegalArgumentException("A document must always have core properties defined!");
        }

        // Extended properties
        PackageRelationshipCollection extRel =
            pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE);
        PropertiesDocument props;
        if(extRel.size() == 1) {
            // Always release the part's stream once parsing is done,
            //  whether or not the parse succeeded
            java.io.InputStream extStream =
                pkg.getPart( extRel.getRelationship(0) ).getInputStream();
            try {
                props = PropertiesDocument.Factory.parse(extStream);
            } finally {
                extStream.close();
            }
        } else {
            // No (single) extended properties part, so start with an empty one
            props = PropertiesDocument.Factory.newInstance();
        }
        ext = new ExtendedProperties(props);
    }

    /**
     * Returns the core document properties
     */
    public CoreProperties getCoreProperties() {
        return core;
    }

    /**
     * Returns the extended document properties
     */
    public ExtendedProperties getExtendedProperties() {
        return ext;
    }

    /**
     * Writes out the ooxml properties into the supplied,
     *  new Package.
     * Not yet implemented - currently does nothing.
     */
    public void write(Package pkg) {
        // TODO
    }

    /**
     * The core document properties
     */
    public static class CoreProperties {
        private final PackagePropertiesPart part;

        private CoreProperties(PackagePropertiesPart part) {
            this.part = part;
        }

        /**
         * Sets the title on the underlying core properties part
         */
        public void setTitle(String title) {
            part.setTitleProperty(title);
        }

        /**
         * Returns the value of the title held by the underlying
         *  core properties part
         */
        public String getTitle() {
            return part.getTitleProperty().getValue();
        }

        /**
         * Returns the low level part that these properties wrap
         */
        public PackagePropertiesPart getUnderlyingProperties() {
            return part;
        }
    }

    /**
     * Extended document properties
     */
    public static class ExtendedProperties {
        private final PropertiesDocument props;

        private ExtendedProperties(PropertiesDocument props) {
            this.props = props;

            // Ensure there's always a properties element to hand back
            if(props.getProperties() == null) {
                props.addNewProperties();
            }
        }

        /**
         * Returns the low level xml bean that these properties wrap
         */
        public CTProperties getUnderlyingProperties() {
            return props.getProperties();
        }
    }
}

View File

@ -16,6 +16,12 @@
==================================================================== */
package org.apache.poi;
import java.io.IOException;
import org.apache.poi.POIXMLProperties.*;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
public abstract class POIXMLTextExtractor extends POITextExtractor {
/** The POIXMLDocument that's open */
protected POIXMLDocument document;
@ -28,4 +34,17 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
this.document = document;
}
/**
 * Fetches the core properties of the open document, via
 *  the document's properties wrapper.
 */
public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException {
    POIXMLProperties allProperties = document.getProperties();
    return allProperties.getCoreProperties();
}
/**
 * Fetches the extended properties of the open document, via
 *  the document's properties wrapper.
 */
public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException {
    POIXMLProperties allProperties = document.getProperties();
    return allProperties.getExtendedProperties();
}
}

View File

@ -24,6 +24,7 @@ import org.openxml4j.exceptions.InvalidFormatException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
@ -47,6 +48,7 @@ public class XWPFDocument extends POIXMLDocument {
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
private DocumentDocument wordDoc;
@ -89,4 +91,18 @@ public class XWPFDocument extends POIXMLDocument {
StylesDocument.Factory.parse(parts[0].getInputStream());
return sd.getStyles();
}
/**
 * Looks up every relationship of the hyperlink type
 *  on the core part of the document.
 * To find where a hyperlink points, ask the matching
 *  relationship for its target.
 */
public PackageRelationshipCollection getHyperlinks() {
    PackageRelationshipCollection hyperlinkRels;
    try {
        hyperlinkRels = getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE);
    } catch(InvalidFormatException e) {
        // Not expected to occur, so surface it as an unchecked failure
        throw new IllegalStateException(e);
    }
    return hyperlinkRels;
}
}

View File

@ -16,7 +16,6 @@
==================================================================== */
package org.apache.poi.xwpf.extractor;
import java.io.File;
import java.io.IOException;
import org.apache.poi.POIXMLDocument;
@ -25,7 +24,9 @@ import org.apache.poi.xwpf.XWPFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageRelationship;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
@ -35,6 +36,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
*/
public class XWPFWordExtractor extends POIXMLTextExtractor {
private XWPFDocument document;
private boolean fetchHyperlinks = false;
public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
this(new XWPFDocument(container));
@ -56,6 +58,15 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
));
System.out.println(extractor.getText());
}
/**
 * Controls whether the hyperlink targets are included
 *  alongside the text content when extracting.
 * When left off (the default), only the label of each
 *  hyperlink is output.
 */
public void setFetchHyperlinks(boolean fetch) {
    this.fetchHyperlinks = fetch;
}
public String getText() {
CTBody body = document.getDocumentBody();
@ -64,9 +75,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
// Loop over paragraphs
CTP[] ps = body.getPArray();
for (int i = 0; i < ps.length; i++) {
// Loop over ranges
// Loop over ranges and hyperlinks
// TODO - properly intersperse ranges and hyperlinks
CTR[] rs = ps[i].getRArray();
for (int j = 0; j < rs.length; j++) {
for(int j = 0; j < rs.length; j++) {
// Loop over text runs
CTText[] texts = rs[j].getTArray();
for (int k = 0; k < texts.length; k++) {
@ -75,6 +87,26 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
);
}
}
CTHyperlink[] hls = ps[i].getHyperlinkArray();
for(CTHyperlink hl : hls) {
for(CTR r : hl.getRArray()) {
for(CTText txt : r.getTArray()) {
text.append(txt.getStringValue());
}
}
if(fetchHyperlinks) {
String id = hl.getId();
if(id != null) {
PackageRelationship hlRel =
document.getHyperlinks().getRelationshipByID(id);
if(hlRel != null) {
text.append(" <" + hlRel.getTargetURI().toString() + ">");
}
}
}
}
// New line after each paragraph.
text.append("\n");
}

View File

@ -46,7 +46,7 @@ public class TestXSLFSlideShow extends TestCase {
if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
found = true;
}
System.out.println(part);
//System.out.println(part);
}
assertTrue(found);
}
@ -110,14 +110,14 @@ public class TestXSLFSlideShow extends TestCase {
public void testMetadataBasics() throws Exception {
XSLFSlideShow xml = new XSLFSlideShow(sampleFile);
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
assertEquals(0, xml.getExtendedProperties().getCharacters());
assertEquals(0, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
}

View File

@ -92,29 +92,29 @@ public class TestXWPFDocument extends TestCase {
XWPFDocument xml = new XWPFDocument(
POIXMLDocument.openPackage(sampleFile.toString())
);
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
assertEquals(1315, xml.getExtendedProperties().getCharacters());
assertEquals(10, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
public void testMetadataComplex() throws Exception {
XWPFDocument xml = new XWPFDocument(
POIXMLDocument.openPackage(complexFile.toString())
);
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
assertEquals(5184, xml.getExtendedProperties().getCharacters());
assertEquals(0, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(" ", xml.getProperties().getCoreProperties().getTitle());
assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
}

View File

@ -37,6 +37,12 @@ public class TestXWPFWordExtractor extends TestCase {
*/
private XWPFDocument xmlB;
private File fileB;
/**
* File with hyperlinks
*/
private XWPFDocument xmlC;
private File fileC;
protected void setUp() throws Exception {
super.setUp();
@ -49,11 +55,17 @@ public class TestXWPFWordExtractor extends TestCase {
System.getProperty("HWPF.testdata.path") +
File.separator + "IllustrativeCases.docx"
);
fileC = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "TestDocument.docx"
);
assertTrue(fileA.exists());
assertTrue(fileB.exists());
assertTrue(fileC.exists());
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
}
/**
@ -117,4 +129,32 @@ public class TestXWPFWordExtractor extends TestCase {
}
assertEquals(79, ps);
}
/**
 * Checks the text extracted from the document with hyperlinks,
 *  both with and without the hyperlink targets being included.
 */
public void testGetWithHyperlinks() throws Exception {
    XWPFWordExtractor hyperlinkExtractor = new XWPFWordExtractor(xmlC);

    // First just make sure extraction runs in both modes
    hyperlinkExtractor.getText();
    hyperlinkExtractor.setFetchHyperlinks(true);
    hyperlinkExtractor.getText();

    // Now check contents
    // TODO - fix once correctly handling contents
    // The last line ought to read "We have a hyperlink here, and another.",
    //  but for now the hyperlink text comes out after the plain text
    //  of its paragraph, which is what is asserted below
    String labelsOnly =
        "This is a test document\nThis bit is in bold and italic\n" +
        "Back to normal\nWe have a here, and .hyperlinkanother\n";
    String labelsAndTargets =
        "This is a test document\nThis bit is in bold and italic\n" +
        "Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n";

    hyperlinkExtractor.setFetchHyperlinks(false);
    assertEquals(labelsOnly, hyperlinkExtractor.getText());

    hyperlinkExtractor.setFetchHyperlinks(true);
    assertEquals(labelsAndTargets, hyperlinkExtractor.getText());
}
}