mirror of https://github.com/apache/poi.git
Add Visio OOXML text extractor + tests
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1709361 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9716fd9a06
commit
bc6ee96e1a
|
@ -105,7 +105,7 @@ public class TestAllFiles {
|
|||
// Visio - binary
|
||||
HANDLERS.put(".vsd", new HDGFFileHandler());
|
||||
|
||||
// Visio - ooxml (currently unsupported)
|
||||
// Visio - ooxml
|
||||
HANDLERS.put(".vsdm", new XDGFFileHandler());
|
||||
HANDLERS.put(".vsdx", new XDGFFileHandler());
|
||||
HANDLERS.put(".vssm", new XDGFFileHandler());
|
||||
|
|
|
@ -16,19 +16,11 @@
|
|||
==================================================================== */
|
||||
package org.apache.poi.stress;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.POIXMLDocument;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||
import org.apache.poi.openxml4j.opc.PackagePart;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
|
||||
import org.apache.poi.util.PackageHelper;
|
||||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
|
||||
import org.junit.Test;
|
||||
|
||||
public class XDGFFileHandler extends AbstractFileHandler {
|
||||
|
@ -37,39 +29,19 @@ public class XDGFFileHandler extends AbstractFileHandler {
|
|||
// ignore password protected files
|
||||
if (POIXMLDocumentHandler.isEncrypted(stream)) return;
|
||||
|
||||
TestXDGFXMLDocument doc = new TestXDGFXMLDocument(stream);
|
||||
XmlVisioDocument doc = new XmlVisioDocument(stream);
|
||||
new POIXMLDocumentHandler().handlePOIXMLDocument(doc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handleExtracting(File file) throws Exception {
|
||||
// TODO: extraction/actual operations not supported yet
|
||||
}
|
||||
|
||||
|
||||
// a test-case to test this locally without executing the full TestAllFiles
|
||||
@Test
|
||||
public void test() throws Exception {
|
||||
OPCPackage pkg = OPCPackage.open("test-data/diagram/test.vsdx", PackageAccess.READ);
|
||||
try {
|
||||
TestXDGFXMLDocument doc = new TestXDGFXMLDocument(pkg);
|
||||
XmlVisioDocument doc = new XmlVisioDocument(pkg);
|
||||
new POIXMLDocumentHandler().handlePOIXMLDocument(doc);
|
||||
} finally {
|
||||
pkg.close();
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Get rid of this when full visio ooxml support is added
|
||||
private final static class TestXDGFXMLDocument extends POIXMLDocument {
|
||||
public TestXDGFXMLDocument(OPCPackage pkg) {
|
||||
super(pkg, PackageRelationshipTypes.VISIO_CORE_DOCUMENT);
|
||||
}
|
||||
|
||||
public TestXDGFXMLDocument(InputStream is) throws IOException {
|
||||
this(PackageHelper.open(is));
|
||||
}
|
||||
|
||||
public List<PackagePart> getAllEmbedds() throws OpenXML4JException {
|
||||
return new ArrayList<PackagePart>();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -55,6 +55,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
|
|||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
|
||||
|
@ -172,11 +173,9 @@ public class ExtractorFactory {
|
|||
}
|
||||
if (core.size() == 0) {
|
||||
// Could it be a visio one?
|
||||
PackageRelationshipCollection visio =
|
||||
pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
|
||||
if (visio.size() == 1) {
|
||||
throw new IllegalArgumentException("Text extraction not supported for Visio OOXML files");
|
||||
}
|
||||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
|
||||
if (core.size() == 1)
|
||||
return new XDGFVisioExtractor(pkg);
|
||||
}
|
||||
|
||||
// Should just be a single core document, complain if not
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
package org.apache.poi.xdgf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POIXMLDocument;
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.xdgf.usermodel.XDGFPage;
|
||||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
|
||||
import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor;
|
||||
|
||||
/**
|
||||
* Helper class to extract text from an OOXML Visio File
|
||||
*/
|
||||
public class XDGFVisioExtractor extends POIXMLTextExtractor {
|
||||
|
||||
protected final XmlVisioDocument document;
|
||||
|
||||
public XDGFVisioExtractor(XmlVisioDocument document) {
|
||||
super(document);
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
public XDGFVisioExtractor(OPCPackage openPackage) throws IOException {
|
||||
this(new XmlVisioDocument(openPackage));
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
ShapeTextVisitor visitor = new ShapeTextVisitor();
|
||||
|
||||
for (XDGFPage page: document.getPages()) {
|
||||
page.getContent().visitShapes(visitor);
|
||||
}
|
||||
|
||||
return visitor.getText().toString();
|
||||
}
|
||||
|
||||
public static void main(String [] args) throws IOException {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" XDGFVisioExtractor <filename.vsdx>");
|
||||
System.exit(1);
|
||||
}
|
||||
POIXMLTextExtractor extractor =
|
||||
new XDGFVisioExtractor(POIXMLDocument.openPackage(
|
||||
args[0]
|
||||
));
|
||||
System.out.println(extractor.getText());
|
||||
extractor.close();
|
||||
}
|
||||
}
|
|
@ -29,6 +29,9 @@ import com.microsoft.schemas.office.visio.x2012.main.VisioDocumentType;
|
|||
|
||||
/**
|
||||
* Represents the root document: /visio/document.xml
|
||||
*
|
||||
* You're probably actually looking for {@link XmlVisioDocument}; this class
|
||||
* only contains metadata about the root document in the OOXML package.
|
||||
*/
|
||||
public class XDGFDocument {
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.poi.xdgf.usermodel;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -107,15 +108,21 @@ public class XmlVisioDocument extends POIXMLDocument {
|
|||
_pages.onDocumentRead();
|
||||
}
|
||||
|
||||
/**
|
||||
* Not currently implemented
|
||||
*/
|
||||
@Override
|
||||
public List<PackagePart> getAllEmbedds() throws OpenXML4JException {
|
||||
throw new UnsupportedOperationException("Not implemented");
|
||||
return new ArrayList<PackagePart>();
|
||||
}
|
||||
|
||||
//
|
||||
// Useful public API goes here
|
||||
//
|
||||
|
||||
/**
|
||||
* @return pages ordered by page number
|
||||
*/
|
||||
public Collection<XDGFPage> getPages() {
|
||||
return _pages.getPageList();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.poi.xdgf.usermodel.shape;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
|
||||
import org.apache.poi.xdgf.usermodel.XDGFShape;
|
||||
|
||||
/**
|
||||
* Only visits text nodes, accumulates text content into a string
|
||||
*
|
||||
* The text is returned in arbitrary order, with no regards to
|
||||
* the location of the text on the page. This may change in the
|
||||
* future.
|
||||
*/
|
||||
public class ShapeTextVisitor extends ShapeVisitor {
|
||||
|
||||
protected StringBuilder text = new StringBuilder();
|
||||
|
||||
public static class TextAcceptor implements ShapeVisitorAcceptor {
|
||||
public boolean accept(XDGFShape shape) {
|
||||
return shape.hasText();
|
||||
}
|
||||
}
|
||||
|
||||
protected ShapeVisitorAcceptor getAcceptor() {
|
||||
return new TextAcceptor();
|
||||
}
|
||||
|
||||
public void visit(XDGFShape shape, AffineTransform globalTransform,
|
||||
int level) {
|
||||
text.append(shape.getText().getTextContent().trim());
|
||||
text.append('\n');
|
||||
}
|
||||
|
||||
/**
|
||||
* Call this after visitation has completed
|
||||
*/
|
||||
public String getText() {
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -44,6 +44,7 @@ import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
|
|||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
||||
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
||||
|
@ -271,12 +272,13 @@ public class TestExtractorFactory {
|
|||
ExtractorFactory.createExtractor(vsd).getText().length() > 50
|
||||
);
|
||||
// Visio - vsdx
|
||||
try {
|
||||
ExtractorFactory.createExtractor(vsdx);
|
||||
fail();
|
||||
} catch(IllegalArgumentException e) {
|
||||
// Good
|
||||
}
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsdx)
|
||||
instanceof XDGFVisioExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(vsdx).getText().length() > 20
|
||||
);
|
||||
|
||||
// Publisher
|
||||
assertTrue(
|
||||
|
@ -391,13 +393,15 @@ public class TestExtractorFactory {
|
|||
ExtractorFactory.createExtractor(new FileInputStream(vsd)).getText().length() > 50
|
||||
);
|
||||
// Visio - vsdx
|
||||
try {
|
||||
ExtractorFactory.createExtractor(new FileInputStream(vsdx));
|
||||
fail();
|
||||
} catch(IllegalArgumentException e) {
|
||||
// Good
|
||||
}
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new FileInputStream(vsdx))
|
||||
instanceof XDGFVisioExtractor
|
||||
);
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new FileInputStream(vsdx)).getText().length() > 20
|
||||
);
|
||||
|
||||
|
||||
// Publisher
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(new FileInputStream(pub))
|
||||
|
@ -551,6 +555,15 @@ public class TestExtractorFactory {
|
|||
extractor.getText().length() > 120
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
// Visio
|
||||
assertTrue(
|
||||
ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()))
|
||||
instanceof XDGFVisioExtractor
|
||||
);
|
||||
assertTrue(
|
||||
extractor.getText().length() > 20
|
||||
);
|
||||
|
||||
// Text
|
||||
try {
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
package org.apache.poi.xdgf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestXDGFVisioExtractor extends TestCase {
|
||||
|
||||
private POIDataSamples diagrams;
|
||||
private OPCPackage pkg;
|
||||
private XmlVisioDocument xml;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
diagrams = POIDataSamples.getDiagramInstance();
|
||||
|
||||
pkg = OPCPackage.open(diagrams.openResourceAsStream("test_text_extraction.vsdx"));
|
||||
xml = new XmlVisioDocument(pkg);
|
||||
}
|
||||
|
||||
public void testGetSimpleText() throws IOException {
|
||||
new XDGFVisioExtractor(xml).close();
|
||||
new XDGFVisioExtractor(pkg).close();
|
||||
|
||||
XDGFVisioExtractor extractor = new XDGFVisioExtractor(xml);
|
||||
extractor.getText();
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
|
||||
assertEquals("Text here\nText there\nText, text, everywhere!\nRouter here\n",
|
||||
text);
|
||||
|
||||
extractor.close();
|
||||
}
|
||||
}
|
Binary file not shown.
Loading…
Reference in New Issue