mirror of https://github.com/apache/poi.git
Tweak how we do ooxml properties, and handle hyperlinks for word documents when extracting
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646298 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0650fb522f
commit
b2e48a2767
|
@ -32,9 +32,6 @@ import org.openxml4j.opc.PackageRelationship;
|
||||||
import org.openxml4j.opc.PackageRelationshipCollection;
|
import org.openxml4j.opc.PackageRelationshipCollection;
|
||||||
import org.openxml4j.opc.PackageRelationshipTypes;
|
import org.openxml4j.opc.PackageRelationshipTypes;
|
||||||
import org.openxml4j.opc.PackagingURIHelper;
|
import org.openxml4j.opc.PackagingURIHelper;
|
||||||
import org.openxml4j.opc.internal.PackagePropertiesPart;
|
|
||||||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
|
|
||||||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
|
|
||||||
|
|
||||||
public abstract class POIXMLDocument {
|
public abstract class POIXMLDocument {
|
||||||
|
|
||||||
|
@ -48,6 +45,12 @@ public abstract class POIXMLDocument {
|
||||||
/** The OPC core Package Part */
|
/** The OPC core Package Part */
|
||||||
private PackagePart corePart;
|
private PackagePart corePart;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The properties of the OPC package, opened as needed
|
||||||
|
*/
|
||||||
|
private POIXMLProperties properties;
|
||||||
|
|
||||||
|
|
||||||
protected POIXMLDocument() {}
|
protected POIXMLDocument() {}
|
||||||
|
|
||||||
protected POIXMLDocument(Package pkg) throws IOException {
|
protected POIXMLDocument(Package pkg) throws IOException {
|
||||||
|
@ -178,28 +181,13 @@ public abstract class POIXMLDocument {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the core document properties (core ooxml properties).
|
* Get the document properties. This gives you access to the
|
||||||
* TODO: Replace with nice usermodel wrapper
|
* core ooxml properties, and the extended ooxml properties.
|
||||||
* @deprecated To be replaced with a proper user-model style view of the properties
|
|
||||||
*/
|
*/
|
||||||
public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException {
|
public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException {
|
||||||
PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE);
|
if(properties == null) {
|
||||||
if(propsPart == null) {
|
properties = new POIXMLProperties(pkg);
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
return (PackagePropertiesPart)propsPart;
|
return properties;
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the extended document properties (extended ooxml properties)
|
|
||||||
* TODO: Replace with nice usermodel wrapper
|
|
||||||
* @deprecated To be replaced with a proper user-model style view of the properties
|
|
||||||
*/
|
|
||||||
public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException {
|
|
||||||
PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE);
|
|
||||||
|
|
||||||
PropertiesDocument props = PropertiesDocument.Factory.parse(
|
|
||||||
propsPart.getInputStream());
|
|
||||||
return props.getProperties();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,124 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.xmlbeans.XmlException;
|
||||||
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
|
import org.openxml4j.opc.Package;
|
||||||
|
import org.openxml4j.opc.PackageRelationshipCollection;
|
||||||
|
import org.openxml4j.opc.internal.PackagePropertiesPart;
|
||||||
|
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
|
||||||
|
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wrapper around the two different kinds of OOXML properties
|
||||||
|
* a document can have
|
||||||
|
*/
|
||||||
|
public class POIXMLProperties {
|
||||||
|
private Package pkg;
|
||||||
|
private CoreProperties core;
|
||||||
|
private ExtendedProperties ext;
|
||||||
|
|
||||||
|
public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException {
|
||||||
|
this.pkg = docPackage;
|
||||||
|
|
||||||
|
// Core properties
|
||||||
|
PackageRelationshipCollection coreRel =
|
||||||
|
pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE);
|
||||||
|
if(coreRel.size() == 1) {
|
||||||
|
core = new CoreProperties( (PackagePropertiesPart)
|
||||||
|
pkg.getPart(coreRel.getRelationship(0)) );
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("A document must always have core properties defined!");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extended properties
|
||||||
|
PackageRelationshipCollection extRel =
|
||||||
|
pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE);
|
||||||
|
if(extRel.size() == 1) {
|
||||||
|
PropertiesDocument props = PropertiesDocument.Factory.parse(
|
||||||
|
pkg.getPart( extRel.getRelationship(0) ).getInputStream()
|
||||||
|
);
|
||||||
|
ext = new ExtendedProperties(props);
|
||||||
|
} else {
|
||||||
|
ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the core document properties
|
||||||
|
*/
|
||||||
|
public CoreProperties getCoreProperties() {
|
||||||
|
return core;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the extended document properties
|
||||||
|
*/
|
||||||
|
public ExtendedProperties getExtendedProperties() {
|
||||||
|
return ext;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes out the ooxml properties into the supplied,
|
||||||
|
* new Package
|
||||||
|
*/
|
||||||
|
public void write(Package pkg) {
|
||||||
|
// TODO
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The core document properties
|
||||||
|
*/
|
||||||
|
public class CoreProperties {
|
||||||
|
private PackagePropertiesPart part;
|
||||||
|
private CoreProperties(PackagePropertiesPart part) {
|
||||||
|
this.part = part;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTitle(String title) {
|
||||||
|
part.setTitleProperty(title);
|
||||||
|
}
|
||||||
|
public String getTitle() {
|
||||||
|
return part.getTitleProperty().getValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
public PackagePropertiesPart getUnderlyingProperties() {
|
||||||
|
return part;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extended document properties
|
||||||
|
*/
|
||||||
|
public class ExtendedProperties {
|
||||||
|
private PropertiesDocument props;
|
||||||
|
private ExtendedProperties(PropertiesDocument props) {
|
||||||
|
this.props = props;
|
||||||
|
|
||||||
|
if(props.getProperties() == null) {
|
||||||
|
props.addNewProperties();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public CTProperties getUnderlyingProperties() {
|
||||||
|
return props.getProperties();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -16,6 +16,12 @@
|
||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi;
|
package org.apache.poi;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.poi.POIXMLProperties.*;
|
||||||
|
import org.apache.xmlbeans.XmlException;
|
||||||
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
|
|
||||||
public abstract class POIXMLTextExtractor extends POITextExtractor {
|
public abstract class POIXMLTextExtractor extends POITextExtractor {
|
||||||
/** The POIXMLDocument that's open */
|
/** The POIXMLDocument that's open */
|
||||||
protected POIXMLDocument document;
|
protected POIXMLDocument document;
|
||||||
|
@ -28,4 +34,17 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
|
||||||
|
|
||||||
this.document = document;
|
this.document = document;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the core document properties
|
||||||
|
*/
|
||||||
|
public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException {
|
||||||
|
return document.getProperties().getCoreProperties();
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Returns the extended document properties
|
||||||
|
*/
|
||||||
|
public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException {
|
||||||
|
return document.getProperties().getExtendedProperties();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,7 @@ import org.openxml4j.exceptions.InvalidFormatException;
|
||||||
import org.openxml4j.exceptions.OpenXML4JException;
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
import org.openxml4j.opc.Package;
|
import org.openxml4j.opc.Package;
|
||||||
import org.openxml4j.opc.PackagePart;
|
import org.openxml4j.opc.PackagePart;
|
||||||
|
import org.openxml4j.opc.PackageRelationshipCollection;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
|
||||||
|
@ -47,6 +48,7 @@ public class XWPFDocument extends POIXMLDocument {
|
||||||
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
|
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
|
||||||
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
|
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
|
||||||
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
|
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
|
||||||
|
public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
|
||||||
|
|
||||||
private DocumentDocument wordDoc;
|
private DocumentDocument wordDoc;
|
||||||
|
|
||||||
|
@ -89,4 +91,18 @@ public class XWPFDocument extends POIXMLDocument {
|
||||||
StylesDocument.Factory.parse(parts[0].getInputStream());
|
StylesDocument.Factory.parse(parts[0].getInputStream());
|
||||||
return sd.getStyles();
|
return sd.getStyles();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns all the hyperlink relations for the file.
|
||||||
|
* You'll generally want to get the target to get
|
||||||
|
* the destination of the hyperlink
|
||||||
|
*/
|
||||||
|
public PackageRelationshipCollection getHyperlinks() {
|
||||||
|
try {
|
||||||
|
return getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE);
|
||||||
|
} catch(InvalidFormatException e) {
|
||||||
|
// Should never happen
|
||||||
|
throw new IllegalStateException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,6 @@
|
||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.xwpf.extractor;
|
package org.apache.poi.xwpf.extractor;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.poi.POIXMLDocument;
|
import org.apache.poi.POIXMLDocument;
|
||||||
|
@ -25,7 +24,9 @@ import org.apache.poi.xwpf.XWPFDocument;
|
||||||
import org.apache.xmlbeans.XmlException;
|
import org.apache.xmlbeans.XmlException;
|
||||||
import org.openxml4j.exceptions.OpenXML4JException;
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
import org.openxml4j.opc.Package;
|
import org.openxml4j.opc.Package;
|
||||||
|
import org.openxml4j.opc.PackageRelationship;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
|
||||||
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
|
||||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
||||||
|
@ -35,6 +36,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
|
||||||
*/
|
*/
|
||||||
public class XWPFWordExtractor extends POIXMLTextExtractor {
|
public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||||
private XWPFDocument document;
|
private XWPFDocument document;
|
||||||
|
private boolean fetchHyperlinks = false;
|
||||||
|
|
||||||
public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
|
public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
|
||||||
this(new XWPFDocument(container));
|
this(new XWPFDocument(container));
|
||||||
|
@ -56,6 +58,15 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||||
));
|
));
|
||||||
System.out.println(extractor.getText());
|
System.out.println(extractor.getText());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should we also fetch the hyperlinks, when fetching
|
||||||
|
* the text content? Default is to only output the
|
||||||
|
* hyperlink label, and not the contents
|
||||||
|
*/
|
||||||
|
public void setFetchHyperlinks(boolean fetch) {
|
||||||
|
fetchHyperlinks = fetch;
|
||||||
|
}
|
||||||
|
|
||||||
public String getText() {
|
public String getText() {
|
||||||
CTBody body = document.getDocumentBody();
|
CTBody body = document.getDocumentBody();
|
||||||
|
@ -64,9 +75,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||||
// Loop over paragraphs
|
// Loop over paragraphs
|
||||||
CTP[] ps = body.getPArray();
|
CTP[] ps = body.getPArray();
|
||||||
for (int i = 0; i < ps.length; i++) {
|
for (int i = 0; i < ps.length; i++) {
|
||||||
// Loop over ranges
|
// Loop over ranges and hyperlinks
|
||||||
|
// TODO - properly intersperse ranges and hyperlinks
|
||||||
CTR[] rs = ps[i].getRArray();
|
CTR[] rs = ps[i].getRArray();
|
||||||
for (int j = 0; j < rs.length; j++) {
|
for(int j = 0; j < rs.length; j++) {
|
||||||
// Loop over text runs
|
// Loop over text runs
|
||||||
CTText[] texts = rs[j].getTArray();
|
CTText[] texts = rs[j].getTArray();
|
||||||
for (int k = 0; k < texts.length; k++) {
|
for (int k = 0; k < texts.length; k++) {
|
||||||
|
@ -75,6 +87,26 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CTHyperlink[] hls = ps[i].getHyperlinkArray();
|
||||||
|
for(CTHyperlink hl : hls) {
|
||||||
|
for(CTR r : hl.getRArray()) {
|
||||||
|
for(CTText txt : r.getTArray()) {
|
||||||
|
text.append(txt.getStringValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(fetchHyperlinks) {
|
||||||
|
String id = hl.getId();
|
||||||
|
if(id != null) {
|
||||||
|
PackageRelationship hlRel =
|
||||||
|
document.getHyperlinks().getRelationshipByID(id);
|
||||||
|
if(hlRel != null) {
|
||||||
|
text.append(" <" + hlRel.getTargetURI().toString() + ">");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// New line after each paragraph.
|
// New line after each paragraph.
|
||||||
text.append("\n");
|
text.append("\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,7 +46,7 @@ public class TestXSLFSlideShow extends TestCase {
|
||||||
if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
|
if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
|
||||||
found = true;
|
found = true;
|
||||||
}
|
}
|
||||||
System.out.println(part);
|
//System.out.println(part);
|
||||||
}
|
}
|
||||||
assertTrue(found);
|
assertTrue(found);
|
||||||
}
|
}
|
||||||
|
@ -110,14 +110,14 @@ public class TestXSLFSlideShow extends TestCase {
|
||||||
public void testMetadataBasics() throws Exception {
|
public void testMetadataBasics() throws Exception {
|
||||||
XSLFSlideShow xml = new XSLFSlideShow(sampleFile);
|
XSLFSlideShow xml = new XSLFSlideShow(sampleFile);
|
||||||
|
|
||||||
assertNotNull(xml.getCoreProperties());
|
assertNotNull(xml.getProperties().getCoreProperties());
|
||||||
assertNotNull(xml.getExtendedProperties());
|
assertNotNull(xml.getProperties().getExtendedProperties());
|
||||||
|
|
||||||
assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
|
assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
|
||||||
assertEquals(0, xml.getExtendedProperties().getCharacters());
|
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
|
||||||
assertEquals(0, xml.getExtendedProperties().getLines());
|
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
|
||||||
|
|
||||||
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
|
assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
|
||||||
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
|
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,29 +92,29 @@ public class TestXWPFDocument extends TestCase {
|
||||||
XWPFDocument xml = new XWPFDocument(
|
XWPFDocument xml = new XWPFDocument(
|
||||||
POIXMLDocument.openPackage(sampleFile.toString())
|
POIXMLDocument.openPackage(sampleFile.toString())
|
||||||
);
|
);
|
||||||
assertNotNull(xml.getCoreProperties());
|
assertNotNull(xml.getProperties().getCoreProperties());
|
||||||
assertNotNull(xml.getExtendedProperties());
|
assertNotNull(xml.getProperties().getExtendedProperties());
|
||||||
|
|
||||||
assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
|
assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
|
||||||
assertEquals(1315, xml.getExtendedProperties().getCharacters());
|
assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
|
||||||
assertEquals(10, xml.getExtendedProperties().getLines());
|
assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
|
||||||
|
|
||||||
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
|
assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
|
||||||
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
|
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testMetadataComplex() throws Exception {
|
public void testMetadataComplex() throws Exception {
|
||||||
XWPFDocument xml = new XWPFDocument(
|
XWPFDocument xml = new XWPFDocument(
|
||||||
POIXMLDocument.openPackage(complexFile.toString())
|
POIXMLDocument.openPackage(complexFile.toString())
|
||||||
);
|
);
|
||||||
assertNotNull(xml.getCoreProperties());
|
assertNotNull(xml.getProperties().getCoreProperties());
|
||||||
assertNotNull(xml.getExtendedProperties());
|
assertNotNull(xml.getProperties().getExtendedProperties());
|
||||||
|
|
||||||
assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
|
assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
|
||||||
assertEquals(5184, xml.getExtendedProperties().getCharacters());
|
assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
|
||||||
assertEquals(0, xml.getExtendedProperties().getLines());
|
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
|
||||||
|
|
||||||
assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
|
assertEquals(" ", xml.getProperties().getCoreProperties().getTitle());
|
||||||
assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
|
assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -37,6 +37,12 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||||
*/
|
*/
|
||||||
private XWPFDocument xmlB;
|
private XWPFDocument xmlB;
|
||||||
private File fileB;
|
private File fileB;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* File with hyperlinks
|
||||||
|
*/
|
||||||
|
private XWPFDocument xmlC;
|
||||||
|
private File fileC;
|
||||||
|
|
||||||
protected void setUp() throws Exception {
|
protected void setUp() throws Exception {
|
||||||
super.setUp();
|
super.setUp();
|
||||||
|
@ -49,11 +55,17 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||||
System.getProperty("HWPF.testdata.path") +
|
System.getProperty("HWPF.testdata.path") +
|
||||||
File.separator + "IllustrativeCases.docx"
|
File.separator + "IllustrativeCases.docx"
|
||||||
);
|
);
|
||||||
|
fileC = new File(
|
||||||
|
System.getProperty("HWPF.testdata.path") +
|
||||||
|
File.separator + "TestDocument.docx"
|
||||||
|
);
|
||||||
assertTrue(fileA.exists());
|
assertTrue(fileA.exists());
|
||||||
assertTrue(fileB.exists());
|
assertTrue(fileB.exists());
|
||||||
|
assertTrue(fileC.exists());
|
||||||
|
|
||||||
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
|
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
|
||||||
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
|
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
|
||||||
|
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -117,4 +129,32 @@ public class TestXWPFWordExtractor extends TestCase {
|
||||||
}
|
}
|
||||||
assertEquals(79, ps);
|
assertEquals(79, ps);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testGetWithHyperlinks() throws Exception {
|
||||||
|
XWPFWordExtractor extractor =
|
||||||
|
new XWPFWordExtractor(xmlC);
|
||||||
|
extractor.getText();
|
||||||
|
extractor.setFetchHyperlinks(true);
|
||||||
|
extractor.getText();
|
||||||
|
|
||||||
|
// Now check contents
|
||||||
|
// TODO - fix once correctly handling contents
|
||||||
|
extractor.setFetchHyperlinks(false);
|
||||||
|
assertEquals(
|
||||||
|
// "This is a test document\nThis bit is in bold and italic\n" +
|
||||||
|
// "Back to normal\nWe have a hyperlink here, and another.\n",
|
||||||
|
"This is a test document\nThis bit is in bold and italic\n" +
|
||||||
|
"Back to normal\nWe have a here, and .hyperlinkanother\n",
|
||||||
|
extractor.getText()
|
||||||
|
);
|
||||||
|
|
||||||
|
extractor.setFetchHyperlinks(true);
|
||||||
|
assertEquals(
|
||||||
|
// "This is a test document\nThis bit is in bold and italic\n" +
|
||||||
|
// "Back to normal\nWe have a hyperlink here, and another.\n",
|
||||||
|
"This is a test document\nThis bit is in bold and italic\n" +
|
||||||
|
"Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n",
|
||||||
|
extractor.getText()
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue