mirror of https://github.com/apache/poi.git
Add an OOXML .pptx text extractor and its test. Also add the jar-ooxml Ant task.
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607572 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
44e5f549c4
commit
71085a913b
15
build.xml
15
build.xml
|
@ -1124,6 +1124,21 @@ FORREST_HOME environment variable!</echo>
|
|||
</manifest>
|
||||
</jar>
|
||||
</target>
|
||||
<!--
  jar-ooxml: packages the compiled OOXML classes, together with the contents
  of the legal/ directory, into a single version- and date-stamped jar in the
  distribution directory. Depends on compile-ooxml.
-->
<target name="jar-ooxml"
        depends="compile-ooxml"
        description="Creates the ooxml jar files for distribution">
    <jar destfile="${dist.dir}/${jar.name}-ooxml-${version.id}-${DSTAMP}.jar">
        <!-- Jar contents: OOXML build output plus the legal/ files -->
        <fileset dir="${ooxml.output.dir}"/>
        <fileset dir="legal/"/>

        <!-- Manifest entries identifying the builder, specification and implementation -->
        <manifest>
            <attribute name="Built-By"
                       value="${user.name}"/>
            <attribute name="Specification-Title"
                       value="Apache POI"/>
            <attribute name="Specification-Version"
                       value="${version.id}-${DSTAMP}"/>
            <attribute name="Specification-Vendor"
                       value="Apache"/>
            <attribute name="Implementation-Title"
                       value="Apache POI"/>
            <attribute name="Implementation-Version"
                       value="${version.id}-${DSTAMP}"/>
            <attribute name="Implementation-Vendor"
                       value="Apache"/>
        </manifest>
    </jar>
</target>
|
||||
|
||||
<target name="dist" depends="clean, fail-unless-tools-are-available, compile, site, jar"
|
||||
description="Creates the entire distribution into build/dist, from scratch">
|
||||
|
|
|
@ -0,0 +1,122 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hslf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POIXMLTextExtractor;
|
||||
import org.apache.poi.hslf.HSLFXML;
|
||||
import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
|
||||
import org.apache.poi.hxf.HXFDocument;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.openxml4j.opc.Package;
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
|
||||
|
||||
public class HXFPowerPointExtractor extends POIXMLTextExtractor {
|
||||
private HSLFXMLSlideShow slideshow;
|
||||
|
||||
public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
|
||||
this(new HSLFXMLSlideShow(
|
||||
new HSLFXML(container)
|
||||
));
|
||||
}
|
||||
public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
|
||||
super(slideshow);
|
||||
this.slideshow = slideshow;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" HXFPowerPointExtractor <filename.pptx>");
|
||||
System.exit(1);
|
||||
}
|
||||
POIXMLTextExtractor extractor =
|
||||
new HXFPowerPointExtractor(HXFDocument.openPackage(
|
||||
new File(args[0])
|
||||
));
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the slide and notes text
|
||||
*/
|
||||
public String getText() {
|
||||
return getText(true, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the requested text from the file
|
||||
* @param slideText Should we retrieve text from slides?
|
||||
* @param notesText Should we retrieve text from notes?
|
||||
*/
|
||||
public String getText(boolean slideText, boolean notesText) {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
CTSlideIdListEntry[] slideRefs =
|
||||
slideshow._getHSLFXML().getSlideReferences().getSldIdArray();
|
||||
for (int i = 0; i < slideRefs.length; i++) {
|
||||
try {
|
||||
CTSlide slide =
|
||||
slideshow._getHSLFXML().getSlide(slideRefs[i]);
|
||||
CTNotesSlide notes =
|
||||
slideshow._getHSLFXML().getNotes(slideRefs[i]);
|
||||
|
||||
if(slideText) {
|
||||
extractText(slide.getCSld().getSpTree(), text);
|
||||
}
|
||||
if(notesText && notes != null) {
|
||||
extractText(notes.getCSld().getSpTree(), text);
|
||||
}
|
||||
} catch(Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
private void extractText(CTGroupShape gs, StringBuffer text) {
|
||||
CTShape[] shapes = gs.getSpArray();
|
||||
for (int i = 0; i < shapes.length; i++) {
|
||||
CTTextBody textBody =
|
||||
shapes[i].getTxBody();
|
||||
if(textBody != null) {
|
||||
CTTextParagraph[] paras =
|
||||
textBody.getPArray();
|
||||
for (int j = 0; j < paras.length; j++) {
|
||||
CTRegularTextRun[] textRuns =
|
||||
paras[j].getRArray();
|
||||
for (int k = 0; k < textRuns.length; k++) {
|
||||
text.append( textRuns[k].getT() );
|
||||
}
|
||||
// End each paragraph with a new line
|
||||
text.append("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -21,7 +21,6 @@ import java.io.File;
|
|||
import org.apache.poi.hxf.HXFDocument;
|
||||
import org.openxml4j.opc.Package;
|
||||
import org.openxml4j.opc.PackagePart;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
|
||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
|
||||
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hslf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.poi.hslf.HSLFXML;
|
||||
import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
|
||||
import org.apache.poi.hxf.HXFDocument;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Tests for HXFPowerPointExtractor
|
||||
*/
|
||||
public class TestHXFPowerPointExtractor extends TestCase {
|
||||
/**
|
||||
* A simple file
|
||||
*/
|
||||
private HSLFXML xmlA;
|
||||
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
|
||||
File fileA = new File(
|
||||
System.getProperty("HSLF.testdata.path") +
|
||||
File.separator + "sample.pptx"
|
||||
);
|
||||
|
||||
xmlA = new HSLFXML(HXFDocument.openPackage(fileA));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get text out of the simple file
|
||||
*/
|
||||
public void testGetSimpleText() throws Exception {
|
||||
new HXFPowerPointExtractor(xmlA.getPackage());
|
||||
new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
|
||||
|
||||
HXFPowerPointExtractor extractor =
|
||||
new HXFPowerPointExtractor(xmlA.getPackage());
|
||||
extractor.getText();
|
||||
|
||||
String text = extractor.getText();
|
||||
assertTrue(text.length() > 0);
|
||||
|
||||
// Check Basics
|
||||
assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
|
||||
assertTrue(text.endsWith("amet\n\n\n\n"));
|
||||
|
||||
// Just slides, no notes
|
||||
text = extractor.getText(true, false);
|
||||
assertEquals(
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||
"\n" +
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Lorem\n" +
|
||||
"ipsum\n" +
|
||||
"dolor\n" +
|
||||
"sit\n" +
|
||||
"amet\n" +
|
||||
"\n", text
|
||||
);
|
||||
|
||||
// Just notes, no slides
|
||||
text = extractor.getText(false, true);
|
||||
assertEquals(
|
||||
"\n\n\n\n", text
|
||||
);
|
||||
|
||||
// Both
|
||||
text = extractor.getText(true, true);
|
||||
assertEquals(
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
|
||||
"\n\n\n" +
|
||||
"Lorem ipsum dolor sit amet\n" +
|
||||
"Lorem\n" +
|
||||
"ipsum\n" +
|
||||
"dolor\n" +
|
||||
"sit\n" +
|
||||
"amet\n" +
|
||||
"\n\n\n", text
|
||||
);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue