From 71085a913bca601805bd7d2d3aba747312321c83 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sun, 30 Dec 2007 18:11:55 +0000 Subject: [PATCH] OOXML pptx text extractor, and test. Also add jar-ooxml ant task git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607572 13f79535-47bb-0310-9956-ffa450edef68 --- build.xml | 15 +++ .../extractor/HXFPowerPointExtractor.java | 122 ++++++++++++++++++ .../org/apache/poi/hslf/TestHSLFXML.java | 1 - .../extractor/TestHXFPowerPointExtractor.java | 101 +++++++++++++++ 4 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java create mode 100644 src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java diff --git a/build.xml b/build.xml index a11bb94262..5989ddad3e 100644 --- a/build.xml +++ b/build.xml @@ -1124,6 +1124,21 @@ FORREST_HOME environment variable! + + + + + + + + + + + + + + + diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java new file mode 100644 index 0000000000..b0e7364019 --- /dev/null +++ b/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java @@ -0,0 +1,122 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hslf.extractor; + +import java.io.File; +import java.io.IOException; + +import org.apache.poi.POIXMLTextExtractor; +import org.apache.poi.hslf.HSLFXML; +import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow; +import org.apache.poi.hxf.HXFDocument; +import org.apache.xmlbeans.XmlException; +import org.openxml4j.exceptions.OpenXML4JException; +import org.openxml4j.opc.Package; +import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun; +import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; +import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph; +import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape; +import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide; +import org.openxmlformats.schemas.presentationml.x2006.main.CTShape; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; + +public class HXFPowerPointExtractor extends POIXMLTextExtractor { + private HSLFXMLSlideShow slideshow; + + public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException { + this(new HSLFXMLSlideShow( + new HSLFXML(container) + )); + } + public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) { + super(slideshow); + this.slideshow = slideshow; + } + + public static void main(String[] args) throws Exception { + if(args.length < 1) { + System.err.println("Use:"); + System.err.println(" HXFPowerPointExtractor "); + System.exit(1); + } + POIXMLTextExtractor extractor = + new HXFPowerPointExtractor(HXFDocument.openPackage( + new File(args[0]) + )); + System.out.println(extractor.getText()); + } + + /** + * Gets the slide and notes text + */ + public String getText() { + return getText(true, true); + } + + /** + * Gets the requested text from the file + * @param slideText Should we retrieve text from slides? + * @param notesText Should we retrieve text from notes? + */ + public String getText(boolean slideText, boolean notesText) { + StringBuffer text = new StringBuffer(); + + CTSlideIdListEntry[] slideRefs = + slideshow._getHSLFXML().getSlideReferences().getSldIdArray(); + for (int i = 0; i < slideRefs.length; i++) { + try { + CTSlide slide = + slideshow._getHSLFXML().getSlide(slideRefs[i]); + CTNotesSlide notes = + slideshow._getHSLFXML().getNotes(slideRefs[i]); + + if(slideText) { + extractText(slide.getCSld().getSpTree(), text); + } + if(notesText && notes != null) { + extractText(notes.getCSld().getSpTree(), text); + } + } catch(Exception e) { + throw new RuntimeException(e); + } + } + + return text.toString(); + } + + private void extractText(CTGroupShape gs, StringBuffer text) { + CTShape[] shapes = gs.getSpArray(); + for (int i = 0; i < shapes.length; i++) { + CTTextBody textBody = + shapes[i].getTxBody(); + if(textBody != null) { + CTTextParagraph[] paras = + textBody.getPArray(); + for (int j = 0; j < paras.length; j++) { + CTRegularTextRun[] textRuns = + paras[j].getRArray(); + for (int k = 0; k < textRuns.length; k++) { + text.append( textRuns[k].getT() ); + } + // End each paragraph with a new line + text.append("\n"); + } + } + } + } +} diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java index 11e7efd288..9c122da6dc 100644 --- a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java +++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java @@ -21,7 +21,6 @@ import java.io.File; import org.apache.poi.hxf.HXFDocument; import org.openxml4j.opc.Package; import org.openxml4j.opc.PackagePart; -import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList; import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry; diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java new file mode 100644 index 0000000000..7c96c2986e --- /dev/null +++ b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java @@ -0,0 +1,101 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hslf.extractor; + +import java.io.File; + +import org.apache.poi.hslf.HSLFXML; +import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow; +import org.apache.poi.hxf.HXFDocument; + +import junit.framework.TestCase; + +/** + * Tests for HXFPowerPointExtractor + */ +public class TestHXFPowerPointExtractor extends TestCase { + /** + * A simple file + */ + private HSLFXML xmlA; + + protected void setUp() throws Exception { + super.setUp(); + + File fileA = new File( + System.getProperty("HSLF.testdata.path") + + File.separator + "sample.pptx" + ); + + xmlA = new HSLFXML(HXFDocument.openPackage(fileA)); + } + + /** + * Get text out of the simple file + */ + public void testGetSimpleText() throws Exception { + new HXFPowerPointExtractor(xmlA.getPackage()); + new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA)); + + HXFPowerPointExtractor extractor = + new HXFPowerPointExtractor(xmlA.getPackage()); + extractor.getText(); + + String text = extractor.getText(); + assertTrue(text.length() > 0); + + // Check Basics + assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n")); + assertTrue(text.endsWith("amet\n\n\n\n")); + + // Just slides, no notes + text = extractor.getText(true, false); + assertEquals( + "Lorem ipsum dolor sit amet\n" + + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + + "\n" + + "Lorem ipsum dolor sit amet\n" + + "Lorem\n" + + "ipsum\n" + + "dolor\n" + + "sit\n" + + "amet\n" + + "\n", text + ); + + // Just notes, no slides + text = extractor.getText(false, true); + assertEquals( + "\n\n\n\n", text + ); + + // Both + text = extractor.getText(true, true); + assertEquals( + "Lorem ipsum dolor sit amet\n" + + "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" + + "\n\n\n" + + "Lorem ipsum dolor sit amet\n" + + "Lorem\n" + + "ipsum\n" + + "dolor\n" + + "sit\n" + + "amet\n" + + "\n\n\n", text + ); + } +}