mirror of https://github.com/apache/poi.git
Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@685315 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3638f76a8a
commit
8f94b59e7e
|
@ -37,6 +37,7 @@
|
|||
|
||||
<!-- Don't forget to update status.xml too! -->
|
||||
<release version="3.5.1-beta2" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Add support for headers and footers in XWPF</action>
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
<!-- Don't forget to update changes.xml too! -->
|
||||
<changes>
|
||||
<release version="3.5.1-beta2" date="2008-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">Add POIXMLPropertiesTextExtractor, which provides to the OOXML file formats a similar function to HPSF's HPSFPropertiesExtractor</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">45539 - Improve XWPFWordExtractor to extract headers and footers</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">Improve how XWPF handles paragraph text</action>
|
||||
<action dev="POI-DEVELOPERS" type="add">Add support for headers and footers in XWPF</action>
|
||||
|
|
|
@ -38,8 +38,8 @@ import org.openxml4j.opc.PackagingURIHelper;
|
|||
public abstract class POIXMLDocument {
|
||||
|
||||
public static final String CORE_PROPERTIES_REL_TYPE = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties";
|
||||
|
||||
public static final String EXTENDED_PROPERTIES_REL_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties";
|
||||
public static final String CUSTOM_PROPERTIES_REL_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";
|
||||
|
||||
// OLE embeddings relation name
|
||||
public static final String OLE_OBJECT_REL_TYPE="http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
|
||||
|
|
|
@ -23,8 +23,6 @@ import org.openxml4j.exceptions.OpenXML4JException;
|
|||
import org.openxml4j.opc.Package;
|
||||
import org.openxml4j.opc.PackageRelationshipCollection;
|
||||
import org.openxml4j.opc.internal.PackagePropertiesPart;
|
||||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
|
||||
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
|
||||
|
||||
/**
|
||||
* Wrapper around the two different kinds of OOXML properties
|
||||
|
@ -34,6 +32,7 @@ public class POIXMLProperties {
|
|||
private Package pkg;
|
||||
private CoreProperties core;
|
||||
private ExtendedProperties ext;
|
||||
private CustomProperties cust;
|
||||
|
||||
public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException {
|
||||
this.pkg = docPackage;
|
||||
|
@ -52,12 +51,24 @@ public class POIXMLProperties {
|
|||
PackageRelationshipCollection extRel =
|
||||
pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE);
|
||||
if(extRel.size() == 1) {
|
||||
PropertiesDocument props = PropertiesDocument.Factory.parse(
|
||||
org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument props = org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument.Factory.parse(
|
||||
pkg.getPart( extRel.getRelationship(0) ).getInputStream()
|
||||
);
|
||||
ext = new ExtendedProperties(props);
|
||||
} else {
|
||||
ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance());
|
||||
ext = new ExtendedProperties(org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument.Factory.newInstance());
|
||||
}
|
||||
|
||||
// Custom properties
|
||||
PackageRelationshipCollection custRel =
|
||||
pkg.getRelationshipsByType(POIXMLDocument.CUSTOM_PROPERTIES_REL_TYPE);
|
||||
if(custRel.size() == 1) {
|
||||
org.openxmlformats.schemas.officeDocument.x2006.customProperties.PropertiesDocument props = org.openxmlformats.schemas.officeDocument.x2006.customProperties.PropertiesDocument.Factory.parse(
|
||||
pkg.getPart( custRel.getRelationship(0) ).getInputStream()
|
||||
);
|
||||
cust = new CustomProperties(props);
|
||||
} else {
|
||||
cust = new CustomProperties(org.openxmlformats.schemas.officeDocument.x2006.customProperties.PropertiesDocument.Factory.newInstance());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -75,6 +86,13 @@ public class POIXMLProperties {
|
|||
return ext;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the custom document properties
|
||||
*/
|
||||
public CustomProperties getCustomProperties() {
|
||||
return cust;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes out the ooxml properties into the supplied,
|
||||
* new Package
|
||||
|
@ -108,8 +126,8 @@ public class POIXMLProperties {
|
|||
* Extended document properties
|
||||
*/
|
||||
public class ExtendedProperties {
|
||||
private PropertiesDocument props;
|
||||
private ExtendedProperties(PropertiesDocument props) {
|
||||
private org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument props;
|
||||
private ExtendedProperties(org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument props) {
|
||||
this.props = props;
|
||||
|
||||
if(props.getProperties() == null) {
|
||||
|
@ -117,7 +135,25 @@ public class POIXMLProperties {
|
|||
}
|
||||
}
|
||||
|
||||
public CTProperties getUnderlyingProperties() {
|
||||
public org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties getUnderlyingProperties() {
|
||||
return props.getProperties();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Custom document properties
|
||||
*/
|
||||
public class CustomProperties {
|
||||
private org.openxmlformats.schemas.officeDocument.x2006.customProperties.PropertiesDocument props;
|
||||
private CustomProperties(org.openxmlformats.schemas.officeDocument.x2006.customProperties.PropertiesDocument props) {
|
||||
this.props = props;
|
||||
|
||||
if(props.getProperties() == null) {
|
||||
props.addNewProperties();
|
||||
}
|
||||
}
|
||||
|
||||
public org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties getUnderlyingProperties() {
|
||||
return props.getProperties();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,142 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.openxml4j.opc.internal.PackagePropertiesPart;
|
||||
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
|
||||
|
||||
/**
|
||||
* A {@link POITextExtractor} for returning the textual
|
||||
* content of the OOXML file properties, eg author
|
||||
* and title.
|
||||
*/
|
||||
public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor {
|
||||
/**
|
||||
* Creates a new POIXMLPropertiesTextExtractor for the
|
||||
* given open document.
|
||||
*/
|
||||
public POIXMLPropertiesTextExtractor(POIXMLDocument doc) {
|
||||
super(doc);
|
||||
}
|
||||
/**
|
||||
* Creates a new POIXMLPropertiesTextExtractor, for the
|
||||
* same file that another TextExtractor is already
|
||||
* working on.
|
||||
*/
|
||||
public POIXMLPropertiesTextExtractor(POIXMLTextExtractor otherExtractor) {
|
||||
super(otherExtractor.document);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the core document properties, eg author
|
||||
*/
|
||||
public String getCorePropertiesText() throws IOException, OpenXML4JException, XmlException {
|
||||
StringBuffer text = new StringBuffer();
|
||||
PackagePropertiesPart props =
|
||||
document.getProperties().getCoreProperties().getUnderlyingProperties();
|
||||
|
||||
text.append("Category = " + props.getCategoryProperty().getValue() + "\n");
|
||||
text.append("ContentStatus = " + props.getContentStatusProperty().getValue() + "\n");
|
||||
text.append("ContentType = " + props.getContentTypeProperty().getValue() + "\n");
|
||||
text.append("Created = " + props.getCreatedProperty().getValue() + "\n");
|
||||
text.append("CreatedString = " + props.getCreatedPropertyString() + "\n");
|
||||
text.append("Creator = " + props.getCreatorProperty().getValue() + "\n");
|
||||
text.append("Description = " + props.getDescriptionProperty().getValue() + "\n");
|
||||
text.append("Identifier = " + props.getIdentifierProperty().getValue() + "\n");
|
||||
text.append("Keywords = " + props.getKeywordsProperty().getValue() + "\n");
|
||||
text.append("Language = " + props.getLanguageProperty().getValue() + "\n");
|
||||
text.append("LastModifiedBy = " + props.getLastModifiedByProperty().getValue() + "\n");
|
||||
text.append("LastPrinted = " + props.getLastPrintedProperty().getValue() + "\n");
|
||||
text.append("LastPrintedString = " + props.getLastPrintedPropertyString() + "\n");
|
||||
text.append("Modified = " + props.getModifiedProperty().getValue() + "\n");
|
||||
text.append("ModifiedString = " + props.getModifiedPropertyString() + "\n");
|
||||
text.append("Revision = " + props.getRevisionProperty().getValue() + "\n");
|
||||
text.append("Subject = " + props.getSubjectProperty().getValue() + "\n");
|
||||
text.append("Title = " + props.getTitleProperty().getValue() + "\n");
|
||||
text.append("Version = " + props.getVersionProperty().getValue() + "\n");
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
/**
|
||||
* Returns the extended document properties, eg
|
||||
* application
|
||||
*/
|
||||
public String getExtendedPropertiesText() throws IOException, OpenXML4JException, XmlException {
|
||||
StringBuffer text = new StringBuffer();
|
||||
org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties
|
||||
props = document.getProperties().getExtendedProperties().getUnderlyingProperties();
|
||||
|
||||
text.append("Application = " + props.getApplication() + "\n");
|
||||
text.append("AppVersion = " + props.getAppVersion() + "\n");
|
||||
text.append("Characters = " + props.getCharacters() + "\n");
|
||||
text.append("CharactersWithSpaces = " + props.getCharactersWithSpaces() + "\n");
|
||||
text.append("Company = " + props.getCompany() + "\n");
|
||||
text.append("HyperlinkBase = " + props.getHyperlinkBase() + "\n");
|
||||
text.append("HyperlinksChanged = " + props.getHyperlinksChanged() + "\n");
|
||||
text.append("Lines = " + props.getLines() + "\n");
|
||||
text.append("LinksUpToDate = " + props.getLinksUpToDate() + "\n");
|
||||
text.append("Manager = " + props.getManager() + "\n");
|
||||
text.append("Pages = " + props.getPages() + "\n");
|
||||
text.append("Paragraphs = " + props.getParagraphs() + "\n");
|
||||
text.append("PresentationFormat = " + props.getPresentationFormat() + "\n");
|
||||
text.append("Template = " + props.getTemplate() + "\n");
|
||||
text.append("TotalTime = " + props.getTotalTime() + "\n");
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
/**
|
||||
* Returns the custom document properties, if
|
||||
* there are any
|
||||
*/
|
||||
public String getCustomPropertiesText() throws IOException, OpenXML4JException, XmlException {
|
||||
StringBuffer text = new StringBuffer();
|
||||
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
|
||||
props = document.getProperties().getCustomProperties().getUnderlyingProperties();
|
||||
|
||||
CTProperty[] properties = props.getPropertyArray();
|
||||
for(int i = 0; i<properties.length; i++) {
|
||||
// TODO - finish off
|
||||
String val = "(not implemented!)";
|
||||
|
||||
text.append(
|
||||
properties[i].getName() +
|
||||
" = " + val + "\n"
|
||||
);
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
try {
|
||||
return
|
||||
getCorePropertiesText() +
|
||||
getExtendedPropertiesText() +
|
||||
getCustomPropertiesText();
|
||||
} catch(Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public POITextExtractor getMetadataTextExtractor() {
|
||||
throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
|
||||
}
|
||||
}
|
|
@ -47,6 +47,12 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
|
|||
public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException {
|
||||
return document.getProperties().getExtendedProperties();
|
||||
}
|
||||
/**
|
||||
* Returns the custom document properties
|
||||
*/
|
||||
public CustomProperties getCustomProperties() throws IOException, OpenXML4JException, XmlException {
|
||||
return document.getProperties().getCustomProperties();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns opened document
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
||||
import org.openxml4j.opc.Package;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestXMLPropertiesTextExtractor extends TestCase {
|
||||
private String dirname;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
dirname = System.getProperty("OOXML.testdata.path");
|
||||
assertTrue( (new File(dirname)).exists() );
|
||||
}
|
||||
|
||||
public void testCore() throws Exception {
|
||||
org.openxml4j.opc.Package pkg = Package.open(
|
||||
(new File(dirname, "ExcelWithAttachments.xlsx")).toString()
|
||||
);
|
||||
XSSFWorkbook wb = new XSSFWorkbook(pkg);
|
||||
|
||||
POIXMLPropertiesTextExtractor ext = new POIXMLPropertiesTextExtractor(wb);
|
||||
ext.getText();
|
||||
|
||||
// Now check
|
||||
String text = ext.getText();
|
||||
String cText = ext.getCorePropertiesText();
|
||||
|
||||
assertTrue(text.contains("LastModifiedBy = Yury Batrakov"));
|
||||
assertTrue(cText.contains("LastModifiedBy = Yury Batrakov"));
|
||||
}
|
||||
|
||||
public void testExtended() throws Exception {
|
||||
org.openxml4j.opc.Package pkg = Package.open(
|
||||
(new File(dirname, "ExcelWithAttachments.xlsx")).toString()
|
||||
);
|
||||
XSSFWorkbook wb = new XSSFWorkbook(pkg);
|
||||
|
||||
POIXMLPropertiesTextExtractor ext = new POIXMLPropertiesTextExtractor(wb);
|
||||
ext.getText();
|
||||
|
||||
// Now check
|
||||
String text = ext.getText();
|
||||
String eText = ext.getExtendedPropertiesText();
|
||||
System.out.println(eText);
|
||||
|
||||
assertTrue(text.contains("Application = Microsoft Excel"));
|
||||
assertTrue(text.contains("Company = Mera"));
|
||||
assertTrue(eText.contains("Application = Microsoft Excel"));
|
||||
assertTrue(eText.contains("Company = Mera"));
|
||||
}
|
||||
|
||||
public void testCustom() throws Exception {
|
||||
// TODO!
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue