#62319 - Decommission XSLF-/PowerPointExtractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1829653 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2018-04-20 12:52:59 +00:00
parent 154493dda3
commit e816131759
27 changed files with 824 additions and 1248 deletions

View File

@ -330,8 +330,6 @@ public class TestAllFiles {
);
private static final Set<String> IGNORED = unmodifiableHashSet(
// need JDK8+ - https://bugs.openjdk.java.net/browse/JDK-8038081
"slideshow/42474-2.ppt",
// OPC handler works / XSSF handler fails
"spreadsheet/57181.xlsm",
"spreadsheet/61300.xls"//intentionally fuzzed -- used to cause infinite loop

View File

@ -24,6 +24,7 @@ import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@ -53,12 +54,19 @@ public class XSLFFileHandler extends SlideShowHandler {
// additionally try the other getText() methods
try (XSLFPowerPointExtractor extractor = (XSLFPowerPointExtractor) ExtractorFactory.createExtractor(file)) {
try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
assertNotNull(extractor);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
extractor.setMasterByDefault(true);
assertNotNull(extractor.getText(true, true, true));
assertEquals("With all options disabled we should not get text",
"", extractor.getText(false, false, false));
assertNotNull(extractor.getText());
extractor.setSlidesByDefault(false);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(false);
assertEquals("With all options disabled we should not get text", "", extractor.getText());
}
}

View File

@ -105,6 +105,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
*
* @return the underlying POIDocument
*/
@Override
public POIDocument getDocument() {
// Covariant override: narrows the return type to POIDocument and hands back the stored document as-is.
return document;
}

View File

@ -74,4 +74,9 @@ public abstract class POITextExtractor implements Closeable {
fsToClose.close();
}
}
/**
* Returns the document this extractor works on. The return type is
* {@code Object}; overriding methods may declare a more specific type.
*
* @return the processed document
*/
public abstract Object getDocument();
}

View File

@ -115,26 +115,23 @@ public class OLE2ExtractorFactory {
return threadPreferEventExtractors.get();
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
return (T)createExtractor(fs.getRoot());
}
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException {
return (T)createExtractor(fs.getRoot());
}
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException {
// Only ever an OLE2 one from the root of the FS
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException {
return (T)createExtractor(fs.getRoot());
}
public static POITextExtractor createExtractor(InputStream input) throws IOException {
public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
Class<?> cls = getOOXMLClass();
if (cls != null) {
// Use Reflection to get us the full OOXML-enabled version
try {
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
return (POITextExtractor)m.invoke(null, input);
return (T)m.invoke(null, input);
} catch (IllegalArgumentException iae) {
throw iae;
} catch (Exception e) {

View File

@ -45,7 +45,29 @@ public class DocumentFactoryHelper {
*/
public static InputStream getDecryptedStream(final NPOIFSFileSystem fs, String password)
throws IOException {
EncryptionInfo info = new EncryptionInfo(fs);
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
// as well when the resulting OPCPackage is closed
return new FilterInputStream(getDecryptedStream(fs.getRoot(), password)) {
@Override
public void close() throws IOException {
fs.close();
super.close();
}
};
}
/**
* Wrap the OLE2 data of the DirectoryNode into a decrypted stream by using
* the given password.
*
* @param root The OLE2 directory node for the document
* @param password The password, null if the default password should be used
* @return A stream for reading the decrypted data
* @throws IOException If an error occurs while decrypting or if the password does not match
*/
public static InputStream getDecryptedStream(final DirectoryNode root, String password)
throws IOException {
EncryptionInfo info = new EncryptionInfo(root);
Decryptor d = Decryptor.getInstance(info);
try {
@ -58,20 +80,10 @@ public class DocumentFactoryHelper {
}
if (passwordCorrect) {
// wrap the stream in a FilterInputStream to close the NPOIFSFileSystem
// as well when the resulting OPCPackage is closed
return new FilterInputStream(d.getDataStream(fs.getRoot())) {
@Override
public void close() throws IOException {
fs.close();
super.close();
}
};
} else {
if (password != null)
return d.getDataStream(root);
} else if (password != null) {
throw new EncryptedDocumentException("Password incorrect");
else
} else {
throw new EncryptedDocumentException("The supplied spreadsheet is protected, but no password was supplied");
}
} catch (GeneralSecurityException e) {

View File

@ -1,3 +1,20 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.sl.extractor;
import java.util.ArrayList;
@ -48,6 +65,16 @@ public class SlideShowExtractor<
this.slideshow = slideshow;
}
/**
* Returns the opened document by delegating to the wrapped slideshow's
* {@code getPersistDocument()}.
*
* @return the object returned by {@code slideshow.getPersistDocument()}
*/
@Override
public final Object getDocument() {
return slideshow.getPersistDocument();
}
/**
* Should a call to getText() return slide text? Default is yes
*/
@ -219,7 +246,6 @@ public class SlideShowExtractor<
return;
}
for (final P para : paraList) {
final int oldLen = sb.length();
for (final TextRun tr : para) {
final String str = tr.getRawText().replace("\r", "");
final String newStr;

View File

@ -126,4 +126,13 @@ public interface SlideShow<
* @since POI 4.0.0
*/
POITextExtractor getMetadataTextExtractor();
/**
* Returns the object responsible for persisting this slideshow.
*
* @return the instance which handles the persisting of the slideshow,
* which is either a subclass of {@link org.apache.poi.POIDocument}
* or {@link org.apache.poi.POIXMLDocument}
*
* @since POI 4.0.0
*/
Object getPersistDocument();
}

View File

@ -60,13 +60,40 @@ public class SlideShowFactory {
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final NPOIFSFileSystem fs, String password) throws IOException {
DirectoryNode root = fs.getRoot();
return create(fs.getRoot(), password);
}
/**
* Creates a SlideShow from the given {@link DirectoryNode} without a password.
* Delegates to {@link #create(DirectoryNode, String)} with a {@code null} password.
*
* @param root The {@link DirectoryNode} to start reading the document from
*
* @return The created SlideShow
*
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final DirectoryNode root) throws IOException {
return create(root, null);
}
/**
* Creates a SlideShow from the given DirectoryNode, which may
* be password protected
*
* @param root The {@link DirectoryNode} to start reading the document from
* @param password The password that should be used or null if no password is necessary.
*
* @return The created SlideShow
*
* @throws IOException if an error occurs while reading the data
*/
public static SlideShow<?,?> create(final DirectoryNode root, String password) throws IOException {
// Encrypted OOXML files go inside OLE2 containers, is this one?
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
InputStream stream = null;
try {
stream = DocumentFactoryHelper.getDecryptedStream(fs, password);
stream = DocumentFactoryHelper.getDecryptedStream(root, password);
return createXSLFSlideShow(stream);
} finally {
@ -82,7 +109,7 @@ public class SlideShowFactory {
passwordSet = true;
}
try {
return createHSLFSlideShow(fs);
return createHSLFSlideShow(root);
} finally {
if (passwordSet) {
Biff8EncryptionKey.setCurrentUserPassword(null);

View File

@ -68,6 +68,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
*
* @return the opened document
*/
@Override
public final POIXMLDocument getDocument() {
// Covariant override: narrows the return type to POIXMLDocument and hands back the stored document as-is.
return _document;
}

View File

@ -51,6 +51,7 @@ import org.apache.poi.poifs.filesystem.NotOLE2FileException;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.NotImplemented;
import org.apache.poi.util.POILogFactory;
@ -58,6 +59,7 @@ import org.apache.poi.util.POILogger;
import org.apache.poi.util.Removal;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
@ -127,20 +129,20 @@ public class ExtractorFactory {
return OLE2ExtractorFactory.getPreferEventExtractor();
}
public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
NPOIFSFileSystem fs = null;
try {
fs = new NPOIFSFileSystem(f);
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
return createEncryptedOOXMLExtractor(fs);
return (T)createEncryptedOOXMLExtractor(fs);
}
POIOLE2TextExtractor extractor = createExtractor(fs);
POITextExtractor extractor = createExtractor(fs);
extractor.setFilesystem(fs);
return extractor;
return (T)extractor;
} catch (OfficeXmlFileException e) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
return (T)createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ));
} catch (NotOLE2FileException ne) {
// ensure file-handle release
IOUtils.closeQuietly(fs);
@ -179,7 +181,7 @@ public class ExtractorFactory {
* @throws XmlException If an XML parsing error occurs.
* @throws IllegalArgumentException If no matching file type could be found.
*/
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
try {
// Check for the normal Office core document
PackageRelationshipCollection core;
@ -226,13 +228,13 @@ public class ExtractorFactory {
// Is it XSLF?
for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
if ( rel.getContentType().equals( contentType ) ) {
return new XSLFPowerPointExtractor(pkg);
return new SlideShowExtractor(new XMLSlideShow(pkg));
}
}
// special handling for SlideShow-Theme-files,
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg));
return new SlideShowExtractor(new XMLSlideShow(pkg));
}
// How about xlsb?
@ -252,28 +254,28 @@ public class ExtractorFactory {
}
}
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs);
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return createExtractor(fs.getRoot());
}
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs);
public static <T extends POITextExtractor> T createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return createExtractor(fs.getRoot());
}
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return OLE2ExtractorFactory.createExtractor(fs);
public static <T extends POITextExtractor> T createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
return createExtractor(fs.getRoot());
}
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
{
// First, check for OOXML
for (String entryName : poifsDir.getEntryNames()) {
if (entryName.equals("Package")) {
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
return createExtractor(pkg);
return (T)createExtractor(pkg);
}
}
// If not, ask the OLE2 code to check, with Scratchpad if possible
return OLE2ExtractorFactory.createExtractor(poifsDir);
return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
}
/**
@ -403,7 +405,7 @@ public class ExtractorFactory {
throw new IllegalStateException("Not yet supported");
}
private static POIXMLTextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
private static POITextExtractor createEncryptedOOXMLExtractor(NPOIFSFileSystem fs)
throws IOException {
String pass = Biff8EncryptionKey.getCurrentUserPassword();
if (pass == null) {

View File

@ -37,7 +37,7 @@ import org.apache.xmlbeans.XmlException;
* @deprecated use {@link SlideShowExtractor}
*/
@Deprecated
@Removal(version="4.2.0")
@Removal(version="5.0.0")
public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
public static final XSLFRelation[] SUPPORTED_TYPES = new XSLFRelation[]{
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,

View File

@ -631,4 +631,9 @@ public class XMLSlideShow extends POIXMLDocument
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
return new POIXMLPropertiesTextExtractor(this);
}
/**
* @return this slideshow instance itself
*/
@Override
public Object getPersistDocument() {
return this;
}
}

View File

@ -1,3 +1,20 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xslf.usermodel;
import static org.apache.poi.xslf.usermodel.XSLFShape.PML_NS;

View File

@ -182,12 +182,20 @@ implements Slide<XSLFShape,XSLFTextParagraph> {
*/
public XSLFCommentAuthors getCommentAuthorsPart() {
if(_commentAuthors == null) {
// first scan the slide relations
for (POIXMLDocumentPart p : getRelations()) {
if (p instanceof XSLFCommentAuthors) {
_commentAuthors = (XSLFCommentAuthors)p;
return _commentAuthors;
}
}
// then scan the presentation relations
for (POIXMLDocumentPart p : getSlideShow().getRelations()) {
if (p instanceof XSLFCommentAuthors) {
_commentAuthors = (XSLFCommentAuthors)p;
return _commentAuthors;
}
}
}
return null;

View File

@ -27,16 +27,15 @@ import static org.junit.Assert.fail;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Locale;
import org.apache.poi.POIDataSamples;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor;
import org.apache.poi.POIXMLException;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.UnsupportedFileFormatException;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.OldExcelFormatException;
@ -44,18 +43,20 @@ import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.junit.BeforeClass;
import org.apache.xmlbeans.XmlException;
import org.junit.Test;
/**
@ -65,34 +66,39 @@ public class TestExtractorFactory {
private static final POILogger LOG = POILogFactory.getLogger(TestExtractorFactory.class);
private static File txt;
private static final POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
private static final File xls = getFileAndCheck(ssTests, "SampleSS.xls");
private static final File xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
private static final File xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
private static final File xltx = getFileAndCheck(ssTests, "test.xltx");
private static final File xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
private static final File xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
private static File xls;
private static File xlsx;
private static File xlsxStrict;
private static File xltx;
private static File xlsEmb;
private static File xlsb;
private static final POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
private static final File doc = getFileAndCheck(wpTests, "SampleDoc.doc");
private static final File doc6 = getFileAndCheck(wpTests, "Word6.doc");
private static final File doc95 = getFileAndCheck(wpTests, "Word95.doc");
private static final File docx = getFileAndCheck(wpTests, "SampleDoc.docx");
private static final File dotx = getFileAndCheck(wpTests, "test.dotx");
private static final File docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
private static final File docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
private static File doc;
private static File doc6;
private static File doc95;
private static File docx;
private static File dotx;
private static File docEmb;
private static File docEmbOOXML;
private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
private static final File ppt = getFileAndCheck(slTests, "SampleShow.ppt");
private static final File pptx = getFileAndCheck(slTests, "SampleShow.pptx");
private static final File txt = getFileAndCheck(slTests, "SampleShow.txt");
private static File ppt;
private static File pptx;
private static final POIDataSamples olTests = POIDataSamples.getHSMFInstance();
private static final File msg = getFileAndCheck(olTests, "quick.msg");
private static final File msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
private static final File msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
private static File msg;
private static File msgEmb;
private static File msgEmbMsg;
private static final POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
private static final File vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
private static final File vsdx = getFileAndCheck(dgTests, "test.vsdx");
private static File vsd;
private static File vsdx;
private static File pub;
private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
private static File pub = getFileAndCheck(pubTests, "Simple.pub");
private static File getFileAndCheck(POIDataSamples samples, String name) {
File file = samples.getFile(name);
@ -104,595 +110,133 @@ public class TestExtractorFactory {
return file;
}
@BeforeClass
public static void setUp() throws Exception {
private static final Object[] TEST_SET = {
"Excel", xls, ExcelExtractor.class, 200,
"Excel - xlsx", xlsx, XSSFExcelExtractor.class, 200,
"Excel - xltx", xltx, XSSFExcelExtractor.class, -1,
"Excel - xlsb", xlsb, XSSFBEventBasedExcelExtractor.class, -1,
"Word", doc, WordExtractor.class, 120,
"Word - docx", docx, XWPFWordExtractor.class, 120,
"Word - dotx", dotx, XWPFWordExtractor.class, -1,
"Word 6", doc6, Word6Extractor.class, 20,
"Word 95", doc95, Word6Extractor.class, 120,
"PowerPoint", ppt, SlideShowExtractor.class, 120,
"PowerPoint - pptx", pptx, SlideShowExtractor.class, 120,
"Visio", vsd, VisioTextExtractor.class, 50,
"Visio - vsdx", vsdx, XDGFVisioExtractor.class, 20,
"Publisher", pub, PublisherTextExtractor.class, 50,
"Outlook msg", msg, OutlookTextExtactor.class, 50,
POIDataSamples ssTests = POIDataSamples.getSpreadSheetInstance();
xls = getFileAndCheck(ssTests, "SampleSS.xls");
xlsx = getFileAndCheck(ssTests, "SampleSS.xlsx");
xlsxStrict = getFileAndCheck(ssTests, "SampleSS.strict.xlsx");
xltx = getFileAndCheck(ssTests, "test.xltx");
xlsEmb = getFileAndCheck(ssTests, "excel_with_embeded.xls");
xlsb = getFileAndCheck(ssTests, "testVarious.xlsb");
// TODO Support OOXML-Strict, see bug #57699
// xlsxStrict
};
POIDataSamples wpTests = POIDataSamples.getDocumentInstance();
doc = getFileAndCheck(wpTests, "SampleDoc.doc");
doc6 = getFileAndCheck(wpTests, "Word6.doc");
doc95 = getFileAndCheck(wpTests, "Word95.doc");
docx = getFileAndCheck(wpTests, "SampleDoc.docx");
dotx = getFileAndCheck(wpTests, "test.dotx");
docEmb = getFileAndCheck(wpTests, "word_with_embeded.doc");
docEmbOOXML = getFileAndCheck(wpTests, "word_with_embeded_ooxml.doc");
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
ppt = getFileAndCheck(slTests, "SampleShow.ppt");
pptx = getFileAndCheck(slTests, "SampleShow.pptx");
txt = getFileAndCheck(slTests, "SampleShow.txt");
POIDataSamples dgTests = POIDataSamples.getDiagramInstance();
vsd = getFileAndCheck(dgTests, "Test_Visio-Some_Random_Text.vsd");
vsdx = getFileAndCheck(dgTests, "test.vsdx");
POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
pub = getFileAndCheck(pubTests, "Simple.pub");
POIDataSamples olTests = POIDataSamples.getHSMFInstance();
msg = getFileAndCheck(olTests, "quick.msg");
msgEmb = getFileAndCheck(olTests, "attachment_test_msg.msg");
msgEmbMsg = getFileAndCheck(olTests, "attachment_msg_pdf.msg");
/**
* Single-argument function whose {@code apply} is allowed to throw the checked
* exceptions {@link IOException}, {@link OpenXML4JException} and {@link XmlException}.
*
* @param <T> the argument type
* @param <R> the result type
*/
@FunctionalInterface
interface FunctionEx<T, R> {
R apply(T t) throws IOException, OpenXML4JException, XmlException;
}
@Test
public void testFile() throws Exception {
// Excel
POITextExtractor xlsExtractor = ExtractorFactory.createExtractor(xls);
assertNotNull("Had empty extractor for " + xls, xlsExtractor);
assertTrue("Expected instanceof ExcelExtractor, but had: " + xlsExtractor.getClass(),
xlsExtractor
instanceof ExcelExtractor
);
assertTrue(
xlsExtractor.getText().length() > 200
);
xlsExtractor.close();
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(xlsb);
assertContains(extractor.getText(), "test");
extractor.close();
extractor = ExtractorFactory.createExtractor(xltx);
assertContains(extractor.getText(), "test");
extractor.close();
// TODO Support OOXML-Strict, see bug #57699
try {
/*extractor =*/ ExtractorFactory.createExtractor(xlsxStrict);
fail("OOXML-Strict isn't yet supported");
} catch (POIXMLException e) {
// Expected, for now
for (int i = 0; i < TEST_SET.length; i += 4) {
try (POITextExtractor ext = ExtractorFactory.createExtractor((File) TEST_SET[i + 1])) {
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
}
}
}
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor
// instanceof XSSFExcelExtractor
// );
// extractor.close();
//
// extractor = ExtractorFactory.createExtractor(xlsxStrict);
// assertTrue(
// extractor.getText().contains("test")
// );
// extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(doc);
assertTrue(
extractor
instanceof WordExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc6);
assertTrue(
extractor
instanceof Word6Extractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
extractor = ExtractorFactory.createExtractor(doc95);
assertTrue(
extractor
instanceof Word6Extractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(
extractor instanceof XWPFWordExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(docx);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertTrue(
extractor instanceof XWPFWordExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(dotx);
assertContains(extractor.getText(), "Test");
extractor.close();
// PowerPoint (PPT)
extractor = ExtractorFactory.createExtractor(ppt);
assertTrue(
extractor
instanceof PowerPointExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// PowerPoint (PPTX)
extractor = ExtractorFactory.createExtractor(pptx);
assertTrue(
extractor
instanceof XSLFPowerPointExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// Visio - binary
extractor = ExtractorFactory.createExtractor(vsd);
assertTrue(
extractor
instanceof VisioTextExtractor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(vsdx);
assertTrue(
extractor
instanceof XDGFVisioExtractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(pub);
assertTrue(
extractor
instanceof PublisherTextExtractor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(msg);
assertTrue(
extractor
instanceof OutlookTextExtactor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
@Test(expected = IllegalArgumentException.class)
public void testFileInvalid() throws Exception {
// Text
try {
ExtractorFactory.createExtractor(txt);
fail("expected IllegalArgumentException");
} catch(IllegalArgumentException e) {
// Good
}
try (POITextExtractor te = ExtractorFactory.createExtractor(txt)) {}
}
@Test
public void testInputStream() throws Exception {
// Excel
POITextExtractor extractor = ExtractorFactory.createExtractor(new FileInputStream(xls));
assertTrue(
extractor
instanceof ExcelExtractor
);
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof XSSFExcelExtractor
);
assertTrue(
extractor.getText().length() > 200
);
// TODO Support OOXML-Strict, see bug #57699
// assertTrue(
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict))
// instanceof XSSFExcelExtractor
// );
// assertTrue(
// ExtractorFactory.createExtractor(new FileInputStream(xlsxStrict)).getText().length() > 200
// );
extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof WordExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof Word6Extractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
assertTrue(
extractor.getClass().getName(),
extractor
instanceof Word6Extractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(docx));
assertTrue(
extractor
instanceof XWPFWordExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// PowerPoint
extractor = ExtractorFactory.createExtractor(new FileInputStream(ppt));
assertTrue(
extractor
instanceof PowerPointExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new FileInputStream(pptx));
assertTrue(
extractor
instanceof XSLFPowerPointExtractor
);
assertTrue(
extractor.getText().length() > 120
);
extractor.close();
// Visio
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsd));
assertTrue(
extractor
instanceof VisioTextExtractor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Visio - vsdx
extractor = ExtractorFactory.createExtractor(new FileInputStream(vsdx));
assertTrue(
extractor
instanceof XDGFVisioExtractor
);
assertTrue(
extractor.getText().length() > 20
);
extractor.close();
// Publisher
extractor = ExtractorFactory.createExtractor(new FileInputStream(pub));
assertTrue(
extractor
instanceof PublisherTextExtractor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Outlook msg
extractor = ExtractorFactory.createExtractor(new FileInputStream(msg));
assertTrue(
extractor
instanceof OutlookTextExtactor
);
assertTrue(
extractor.getText().length() > 50
);
extractor.close();
// Text
try (FileInputStream stream = new FileInputStream(txt)) {
ExtractorFactory.createExtractor(stream);
fail("expected IllegalArgumentException");
} catch(IllegalArgumentException e) {
// Good
testStream((f) -> ExtractorFactory.createExtractor(f), true);
}
/**
 * Feeds the shared {@code testInvalid} helper a function that builds an extractor
 * straight from the supplied stream; the test passes only if an
 * {@link IllegalArgumentException} escapes.
 */
@Test(expected = IllegalArgumentException.class)
public void testInputStreamInvalid() throws Exception {
    final FunctionEx<FileInputStream, POITextExtractor> fromStream =
            stream -> ExtractorFactory.createExtractor(stream);
    testInvalid(fromStream);
}
@Test
public void testPOIFS() throws Exception {
// Excel
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))
instanceof ExcelExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
);
// Word
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc)))
instanceof WordExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6)))
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95)))
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
);
// PowerPoint
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt)))
instanceof PowerPointExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
);
// Visio
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd)))
instanceof VisioTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
);
// Publisher
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub)))
instanceof PublisherTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
);
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg)))
instanceof OutlookTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
);
// Text
try {
ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(txt)));
fail("expected IllegalArgumentException");
} catch(IOException e) {
// Good
}
testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
}
/**
 * Feeds the shared {@code testInvalid} helper a function that first wraps the stream
 * in a {@code POIFSFileSystem} before asking the factory for an extractor; the test
 * passes only if an {@link IOException} escapes.
 */
@Test(expected = IOException.class)
public void testPOIFSInvalid() throws Exception {
    final FunctionEx<FileInputStream, POITextExtractor> viaPoifs =
            stream -> ExtractorFactory.createExtractor(new POIFSFileSystem(stream));
    testInvalid(viaPoifs);
}
@Test
public void testOPOIFS() throws Exception {
// Excel
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls)))
instanceof ExcelExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(xls))).getText().length() > 200
);
testStream((f) -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(f)), false);
}
// Word
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc)))
instanceof WordExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc))).getText().length() > 120
);
/**
 * Feeds the shared {@code testInvalid} helper a function that first wraps the stream
 * in an {@code OPOIFSFileSystem} before asking the factory for an extractor; the test
 * passes only if an {@link IOException} escapes.
 */
@Test(expected = IOException.class)
public void testOPOIFSInvalid() throws Exception {
    final FunctionEx<FileInputStream, POITextExtractor> viaOpoifs =
            stream -> ExtractorFactory.createExtractor(new OPOIFSFileSystem(stream));
    testInvalid(viaOpoifs);
}
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6)))
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc6))).getText().length() > 20
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95)))
instanceof Word6Extractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(doc95))).getText().length() > 120
);
/**
 * Runs every entry of {@code TEST_SET} through the supplied extractor-creating function
 * and validates the resulting extractor via {@code testExtractor}.
 *
 * <p>{@code TEST_SET} is read as a flat array of 4-tuples: test case name, sample
 * {@link File}, expected extractor class and minimum text length.</p>
 *
 * @param poifsIS   function turning an opened {@link FileInputStream} into an extractor
 * @param loadOOXML when {@code false}, files whose name ends in {@code "x"} or
 *                  {@code "xlsb"} are skipped
 * @throws IOException        propagated from opening the file or creating the extractor
 * @throws OpenXML4JException propagated from the extractor function
 * @throws XmlException       propagated from the extractor function
 */
private void testStream(final FunctionEx<FileInputStream, POITextExtractor> poifsIS, final boolean loadOOXML)
        throws IOException, OpenXML4JException, XmlException {
    for (int i = 0; i < TEST_SET.length; i += 4) {
        final File testFile = (File) TEST_SET[i + 1];
        final String fileName = testFile.getName();
        if (!loadOOXML && (fileName.endsWith("x") || fileName.endsWith("xlsb"))) {
            continue;
        }
        try (FileInputStream fis = new FileInputStream(testFile);
             POITextExtractor ext = poifsIS.apply(fis)) {
            testExtractor(ext, (String) TEST_SET[i], (Class<?>) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
        } catch (IllegalArgumentException e) {
            // same failure type and message as fail(...), but keep the original
            // exception attached so its stack trace shows up in the test report
            throw new AssertionError("failed to process " + testFile, e);
        }
    }
}
// PowerPoint
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt)))
instanceof PowerPointExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(ppt))).getText().length() > 120
);
// Visio
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd)))
instanceof VisioTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(vsd))).getText().length() > 50
);
// Publisher
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub)))
instanceof PublisherTextExtractor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(pub))).getText().length() > 50
);
// Outlook msg
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg)))
instanceof OutlookTextExtactor
);
assertTrue(
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(msg))).getText().length() > 50
);
/**
 * Checks one extractor: it must be an instance of the expected class, and its text
 * must either exceed the given minimum length or - when the minimum is {@code -1} -
 * contain the word "test" (compared in lower case).
 *
 * @param ext       extractor under test
 * @param testcase  label used in the assertion messages
 * @param extrClass class the extractor is expected to be an instance of
 * @param minLength exclusive lower bound for the text length, or {@code -1} to switch
 *                  to the "contains test" check
 */
private void testExtractor(final POITextExtractor ext, final String testcase, final Class extrClass, final Integer minLength) {
    assertTrue("invalid extractor for " + testcase, extrClass.isInstance(ext));

    final String text = ext.getText();
    if (minLength != -1) {
        assertTrue("extracted content too short for " + testcase, text.length() > minLength);
        return;
    }
    assertContains(text.toLowerCase(Locale.ROOT), "test");
}
private void testInvalid(FunctionEx<FileInputStream, POITextExtractor> poifs) throws IOException, OpenXML4JException, XmlException {
// Text
try {
ExtractorFactory.createExtractor(new OPOIFSFileSystem(new FileInputStream(txt)));
fail("expected IllegalArgumentException");
} catch(IOException e) {
// Good
try (FileInputStream fis = new FileInputStream(txt);
POITextExtractor te = poifs.apply(fis)) {
}
}
@Test
public void testPackage() throws Exception {
// Excel
POIXMLTextExtractor extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(extractor instanceof XSSFExcelExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
assertTrue(extractor.getText().length() > 200);
extractor.close();
// Word
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
assertTrue(extractor instanceof XWPFWordExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(docx.toString()));
assertTrue(extractor.getText().length() > 120);
extractor.close();
// PowerPoint
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
assertTrue(extractor instanceof XSLFPowerPointExtractor);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(pptx.toString()));
assertTrue(extractor.getText().length() > 120);
extractor.close();
// Visio
extractor = ExtractorFactory.createExtractor(OPCPackage.open(vsdx.toString()));
assertTrue(extractor instanceof XDGFVisioExtractor);
assertTrue(extractor.getText().length() > 20);
extractor.close();
// Text
try {
ExtractorFactory.createExtractor(OPCPackage.open(txt.toString()));
fail("TestExtractorFactory.testPackage() failed on " + txt);
} catch(UnsupportedFileFormatException e) {
// Good
} catch (Exception e) {
LOG.log(POILogger.WARN, "TestExtractorFactory.testPackage() failed on " + txt);
throw e;
for (int i = 0; i < TEST_SET.length; i += 4) {
final File testFile = (File) TEST_SET[i + 1];
if (!testFile.getName().endsWith("x")) {
continue;
}
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
testExtractor(ext, (String) TEST_SET[i], (Class) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
pkg.revert();
}
}
}
/**
 * Opens the {@code txt} sample read-only as an OPC package and asks the factory for an
 * extractor; the test passes only if an {@link UnsupportedFileFormatException} escapes.
 */
@Test(expected = UnsupportedFileFormatException.class)
public void testPackageInvalid() throws Exception {
    try (final OPCPackage opc = OPCPackage.open(txt, PackageAccess.READ);
         final POITextExtractor unused = ExtractorFactory.createExtractor(opc)) {
        // intentionally empty - only acquiring the resources matters here
    }
}
@Test
@ -781,142 +325,49 @@ public class TestExtractorFactory {
* does poifs embedded, but will do ooxml ones
* at some point.
*/
@SuppressWarnings("deprecation")
@Test
public void testEmbedded() throws Exception {
POIOLE2TextExtractor ext;
POITextExtractor[] embeds;
final Object[] testObj = {
"No embeddings", xls, "0-0-0-0-0-0",
"Excel", xlsEmb, "6-2-2-2-0-0",
"Word", docEmb, "4-1-2-1-0-0",
"Word which contains an OOXML file", docEmbOOXML, "3-0-1-1-0-1",
"Outlook", msgEmb, "1-1-0-0-0-0",
"Outlook with another outlook file in it", msgEmbMsg, "1-0-0-0-1-0",
};
// No embeddings
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xls);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertEquals(0, embeds.length);
ext.close();
for (int i=0; i<testObj.length; i+=3) {
try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
// No embeddings
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xls);
embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
assertEquals(0, embeds.length);
ext.close();
// Excel
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xlsEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
assertNotNull(embeds);
ext.close();
// Excel
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(xlsEmb);
embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
assertEquals(6, embeds.length);
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX;
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
if (embed instanceof SlideShowExtractor) {
numPpt++;
} else if (embed instanceof ExcelExtractor) {
numXls++;
} else if (embed instanceof WordExtractor) {
numWord++;
} else if (embed instanceof OutlookTextExtactor) {
numMsg++;
} else if (embed instanceof XWPFWordExtractor) {
numWordX++;
}
assertEquals(2, numPpt);
assertEquals(2, numXls);
assertEquals(2, numWord);
assertEquals(0, numMsg);
ext.close();
// Word
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(docEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(4, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
}
assertEquals(1, numPpt);
assertEquals(2, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
ext.close();
// Word which contains an OOXML file
ext = (POIOLE2TextExtractor)
ExtractorFactory.createExtractor(docEmbOOXML);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0; numWordX = 0;
assertEquals(3, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
else if (embed instanceof XWPFWordExtractor) numWordX++;
final String actual = embeds.length+"-"+numWord+"-"+numXls+"-"+numPpt+"-"+numMsg+"-"+numWordX;
final String expected = (String)testObj[i+2];
assertEquals("invalid number of embeddings - "+testObj[i], expected, actual);
}
assertEquals(1, numPpt);
assertEquals(1, numXls);
assertEquals(0, numWord);
assertEquals(1, numWordX);
assertEquals(0, numMsg);
ext.close();
// Outlook
ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmb);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(1, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
assertEquals(1, numWord);
assertEquals(0, numMsg);
ext.close();
// Outlook with another outlook file in it
ext = (OutlookTextExtactor)
ExtractorFactory.createExtractor(msgEmbMsg);
embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
numWord = 0; numXls = 0; numPpt = 0; numMsg = 0;
assertEquals(1, embeds.length);
for (POITextExtractor embed : embeds) {
assertTrue(embed.getText().length() > 20);
if (embed instanceof PowerPointExtractor) numPpt++;
else if (embed instanceof ExcelExtractor) numXls++;
else if (embed instanceof WordExtractor) numWord++;
else if (embed instanceof OutlookTextExtactor) numMsg++;
}
assertEquals(0, numPpt);
assertEquals(0, numXls);
assertEquals(0, numWord);
assertEquals(1, numMsg);
ext.close();
// TODO - PowerPoint
// TODO - Publisher
// TODO - Visio
}
private static final String[] EXPECTED_FAILURES = new String[] {
private static final String[] EXPECTED_FAILURES = {
// password protected files
"spreadsheet/password.xls",
"spreadsheet/protected_passtika.xlsx",
@ -1018,35 +469,24 @@ public class TestExtractorFactory {
* #59074 - Excel 95 files should give a helpful message, not just
* "No supported documents found in the OLE2 stream"
*/
@Test
@Test(expected = OldExcelFormatException.class)
public void bug59074() throws Exception {
try {
ExtractorFactory.createExtractor(
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
fail("Old excel formats not supported via ExtractorFactory");
} catch (OldExcelFormatException e) {
// expected here
}
}
@SuppressWarnings("deprecation")
@Test
public void testGetEmbeddedFromXMLExtractor() {
try {
@Test(expected = IllegalStateException.class)
public void testGetEmbedFromXMLExtractor() {
// currently not implemented
ExtractorFactory.getEmbededDocsTextExtractors((POIXMLTextExtractor) null);
fail("Unsupported currently");
} catch (IllegalStateException e) {
// expected here
}
try {
@SuppressWarnings("deprecation")
@Test(expected = IllegalStateException.class)
public void testGetEmbeddedFromXMLExtractor() {
// currently not implemented
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
fail("Unsupported currently");
} catch (IllegalStateException e) {
// expected here
}
}
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.

View File

@ -120,10 +120,10 @@ public class TestHxxFEncryption {
public void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
Biff8EncryptionKey.setCurrentUserPassword(password);
File f = sampleDir.getFile(file);
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
POITextExtractor te1 = ExtractorFactory.createExtractor(f);
Biff8EncryptionKey.setCurrentUserPassword(newPass);
ByteArrayOutputStream bos = new ByteArrayOutputStream();
POIDocument doc = te1.getDocument();
POIDocument doc = (POIDocument)te1.getDocument();
doc.write(bos);
doc.close();
te1.close();
@ -140,25 +140,25 @@ public class TestHxxFEncryption {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
Biff8EncryptionKey.setCurrentUserPassword(password);
File f = sampleDir.getFile(file);
POIOLE2TextExtractor te1 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(f);
POITextExtractor te1 = ExtractorFactory.createExtractor(f);
// first remove encryption
Biff8EncryptionKey.setCurrentUserPassword(null);
POIDocument doc = te1.getDocument();
POIDocument doc = (POIDocument)te1.getDocument();
doc.write(bos);
doc.close();
te1.close();
// then use default setting, which is cryptoapi
String newPass = "newPass";
POIOLE2TextExtractor te2 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
POITextExtractor te2 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
Biff8EncryptionKey.setCurrentUserPassword(newPass);
doc = te2.getDocument();
doc = (POIDocument)te2.getDocument();
bos.reset();
doc.write(bos);
doc.close();
te2.close();
// and finally update cryptoapi setting
POIOLE2TextExtractor te3 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = te3.getDocument();
POITextExtractor te3 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = (POIDocument)te3.getDocument();
// need to cache data (i.e. read all data) before changing the key size
if (doc instanceof HSLFSlideShowImpl) {
HSLFSlideShowImpl hss = (HSLFSlideShowImpl)doc;
@ -175,8 +175,8 @@ public class TestHxxFEncryption {
doc.close();
te3.close();
// check the setting
POIOLE2TextExtractor te4 = (POIOLE2TextExtractor)ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = te4.getDocument();
POITextExtractor te4 = ExtractorFactory.createExtractor(new ByteArrayInputStream(bos.toByteArray()));
doc = (POIDocument)te4.getDocument();
ei = doc.getEncryptionInfo();
assertNotNull(ei);
assertTrue(ei.getHeader() instanceof CryptoAPIEncryptionHeader);

View File

@ -50,6 +50,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.sl.draw.DrawPaint;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.PaintStyle;
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
import org.apache.poi.sl.usermodel.PaintStyle.TexturePaint;
@ -221,8 +222,8 @@ public class TestXSLFBugs {
* rID2 -> slide3.xml
*/
@Test
public void bug54916() throws Exception {
XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx");
public void bug54916() throws IOException {
try (XMLSlideShow ss = XSLFTestDataSamples.openSampleDocument("OverlappingRelations.pptx")) {
XSLFSlide slide;
// Should find 4 slides
@ -230,19 +231,18 @@ public class TestXSLFBugs {
// Check the text, to see we got them in order
slide = ss.getSlides().get(0);
assertContains(getSlideText(slide), "POI cannot read this");
assertContains(getSlideText(ss, slide), "POI cannot read this");
slide = ss.getSlides().get(1);
assertContains(getSlideText(slide), "POI can read this");
assertContains(getSlideText(slide), "Has a relationship to another slide");
assertContains(getSlideText(ss, slide), "POI can read this");
assertContains(getSlideText(ss, slide), "Has a relationship to another slide");
slide = ss.getSlides().get(2);
assertContains(getSlideText(slide), "POI can read this");
assertContains(getSlideText(ss, slide), "POI can read this");
slide = ss.getSlides().get(3);
assertContains(getSlideText(slide), "POI can read this");
ss.close();
assertContains(getSlideText(ss, slide), "POI can read this");
}
}
/**
@ -311,8 +311,15 @@ public class TestXSLFBugs {
ss.close();
}
protected String getSlideText(XSLFSlide slide) {
return XSLFPowerPointExtractor.getText(slide, true, false, false);
/**
 * Extracts the text of a single slide, with notes and master text switched off.
 *
 * @param ppt   slideshow the extractor is created for
 * @param slide slide whose text is returned
 * @return the text reported by the extractor for {@code slide}
 * @throws IOException declared for closing the extractor
 */
protected String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
    try (SlideShowExtractor slideExtractor = new SlideShowExtractor(ppt)) {
        // do not auto-close the slideshow
        slideExtractor.setFilesystem(null);

        // slides only: no notes, no master
        slideExtractor.setMasterByDefault(false);
        slideExtractor.setNotesByDefault(false);
        slideExtractor.setSlidesByDefault(true);

        final String slideText = slideExtractor.getText(slide);
        return slideText;
    }
}
@Test
@ -458,7 +465,7 @@ public class TestXSLFBugs {
for (int i = 0; i < slideTexts.length; i++) {
XSLFSlide slide = ss.getSlides().get(i);
assertContains(getSlideText(slide), slideTexts[i]);
assertContains(getSlideText(ss, slide), slideTexts[i]);
}
}

View File

@ -24,16 +24,17 @@ import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.POIDataSamples;
import org.apache.poi.POITextExtractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.xmlbeans.XmlException;
import org.junit.Ignore;
import org.junit.Test;
/**
@ -44,21 +45,12 @@ public class TestXSLFPowerPointExtractor {
/**
* Get text out of the simple file
* @throws XmlException
* @throws OpenXML4JException
*/
@Test
public void testGetSimpleText()
throws IOException, XmlException, OpenXML4JException {
XMLSlideShow xmlA = openPPTX("sample.pptx");
@SuppressWarnings("resource")
OPCPackage pkg = xmlA.getPackage();
public void testGetSimpleText() throws IOException {
try (XMLSlideShow xmlA = openPPTX("sample.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xmlA)) {
new XSLFPowerPointExtractor(xmlA).close();
new XSLFPowerPointExtractor(pkg).close();
XSLFPowerPointExtractor extractor =
new XSLFPowerPointExtractor(xmlA);
extractor.getText();
String text = extractor.getText();
@ -82,7 +74,10 @@ public class TestXSLFPowerPointExtractor {
// "Fifth level\n";
// Just slides, no notes
text = extractor.getText(true, false, false);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(false);
text = extractor.getText();
String slideText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
@ -97,11 +92,15 @@ public class TestXSLFPowerPointExtractor {
assertEquals(slideText, text);
// Just notes, no slides
text = extractor.getText(false, true);
extractor.setSlidesByDefault(false);
extractor.setNotesByDefault(true);
text = extractor.getText();
assertEquals("\n\n1\n\n\n2\n", text);
// Both
text = extractor.getText(true, true, false);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
text = extractor.getText();
String bothText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
@ -116,7 +115,10 @@ public class TestXSLFPowerPointExtractor {
assertEquals(bothText, text);
// With Slides and Master Text
text = extractor.getText(true, false, true);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true);
text = extractor.getText();
String smText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
@ -131,7 +133,10 @@ public class TestXSLFPowerPointExtractor {
assertEquals(smText, text);
// With Slides, Notes and Master Text
text = extractor.getText(true, true, true);
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true);
extractor.setMasterByDefault(true);
text = extractor.getText();
String snmText =
"Lorem ipsum dolor sit amet\n" +
"Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
@ -150,14 +155,14 @@ public class TestXSLFPowerPointExtractor {
extractor.setNotesByDefault(true);
text = extractor.getText();
assertEquals("\n\n1\n\n\n2\n", text);
extractor.close();
xmlA.close();
}
}
@Test
public void testGetComments() throws IOException {
XMLSlideShow xml = openPPTX("45545_Comment.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
try (XMLSlideShow xml = openPPTX("45545_Comment.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
extractor.setCommentsByDefault(true);
String text = extractor.getText();
assertTrue(text.length() > 0);
@ -168,18 +173,19 @@ public class TestXSLFPowerPointExtractor {
// Check the authors came through too
assertContains(text, "XPVMWARE01");
extractor.close();
xml.close();
}
}
@Test
@Ignore("currently slidelayouts aren't yet supported")
public void testGetMasterText() throws Exception {
XMLSlideShow xml = openPPTX("WithMaster.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
try (XMLSlideShow xml = openPPTX("WithMaster.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(false);
extractor.setMasterByDefault(true);
String text = extractor.getText();
assertTrue(text.length() > 0);
@ -208,24 +214,20 @@ public class TestXSLFPowerPointExtractor {
"This is the Master Title\n" +
"This text comes from the Master Slide\n";
assertEquals(wholeText, text);
extractor.close();
xml.close();
}
}
@Test
public void testTable() throws Exception {
XMLSlideShow xml = openPPTX("present1.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
try (XMLSlideShow xml = openPPTX("present1.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText();
assertTrue(text.length() > 0);
// Check comments are there
assertContains(text, "TEST");
extractor.close();
xml.close();
}
}
/**
@ -241,8 +243,9 @@ public class TestXSLFPowerPointExtractor {
};
for(String extension : extensions) {
String filename = "testPPT." + extension;
XMLSlideShow xml = openPPTX(filename);
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
try (XMLSlideShow xml = openPPTX(filename);
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText();
if (extension.equals("thmx")) {
@ -257,58 +260,59 @@ public class TestXSLFPowerPointExtractor {
assertContains(filename, text, "content parsing");
assertContains(filename, text, "Different words to test against");
assertContains(filename, text, "Mystery");
extractor.close();
xml.close();
}
}
}
@Test
public void test45541() throws Exception {
public void test45541() throws IOException, OpenXML4JException, XmlException {
// extract text from a powerpoint that has a header in the notes-element
POITextExtractor extr = ExtractorFactory.createExtractor(
slTests.getFile("45541_Header.pptx"));
final File headerFile = slTests.getFile("45541_Header.pptx");
try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
String text = extr.getText();
assertNotNull(text);
assertFalse("Had: " + text, text.contains("testdoc"));
text = ((XSLFPowerPointExtractor)extr).getText(false, true);
extr.setSlidesByDefault(false);
extr.setNotesByDefault(true);
text = extr.getText();
assertContains(text, "testdoc");
extr.close();
assertNotNull(text);
}
// extract text from a powerpoint that has a footer in the master-slide
extr = ExtractorFactory.createExtractor(
slTests.getFile("45541_Footer.pptx"));
final File footerFile = slTests.getFile("45541_Footer.pptx");
try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
String text = extr.getText();
assertNotContained(text, "testdoc");
extr.setSlidesByDefault(false);
extr.setNotesByDefault(true);
text = extr.getText();
assertNotContained(text, "testdoc");
text = ((XSLFPowerPointExtractor)extr).getText(false, true);
extr.setSlidesByDefault(false);
extr.setNotesByDefault(false);
extr.setMasterByDefault(true);
text = extr.getText();
assertNotContained(text, "testdoc");
text = ((XSLFPowerPointExtractor)extr).getText(false, false, true);
assertNotContained(text, "testdoc");
extr.close();
}
}
@Test
public void bug54570() throws IOException {
XMLSlideShow xml = openPPTX("bug54570.pptx");
XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(xml);
try (XMLSlideShow xml = openPPTX("bug54570.pptx");
SlideShowExtractor extractor = new SlideShowExtractor(xml)) {
String text = extractor.getText();
assertNotNull(text);
extractor.close();
xml.close();
}
}
private XMLSlideShow openPPTX(String file) throws IOException {
InputStream is = slTests.openResourceAsStream(file);
try {
try (InputStream is = slTests.openResourceAsStream(file)) {
return new XMLSlideShow(is);
} finally {
is.close();
}
}
}

View File

@ -38,6 +38,8 @@ import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory;
/**
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
@ -65,7 +67,7 @@ public class OLE2ScratchpadExtractorFactory {
}
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
return new PowerPointExtractor(poifsDir);
return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
}
if (poifsDir.hasEntry("VisioDocument")) {

View File

@ -34,6 +34,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.Removal;
/**
* This class can be used to extract text from a PowerPoint file. Can optionally
@ -43,6 +44,7 @@ import org.apache.poi.sl.usermodel.SlideShowFactory;
*/
@SuppressWarnings("WeakerAccess")
@Deprecated
@Removal(version="5.0.0")
public final class PowerPointExtractor extends POIOLE2TextExtractor {
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;

View File

@ -1139,4 +1139,9 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
public void close() throws IOException {
// closing is delegated entirely to the wrapped _hslfSlideShow
_hslfSlideShow.close();
}
/**
 * @return the object obtained from {@code getSlideShowImpl()}, exposed as the
 *         persistable document of this slideshow
 */
@Override
public Object getPersistDocument() {
return getSlideShowImpl();
}
}

View File

@ -19,8 +19,8 @@ package org.apache.poi.hslf.usermodel;
import java.io.IOException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.Internal;
@ -31,12 +31,20 @@ import org.apache.poi.util.Internal;
@Internal
public class HSLFSlideShowFactory extends SlideShowFactory {
/**
* Creates a HSLFSlideShow from the given NPOIFSFileSystem
* <p>Note that in order to properly release resources the
* Creates a HSLFSlideShow from the given NPOIFSFileSystem<p>
* Note that in order to properly release resources the
* SlideShow should be closed after use.
*/
public static SlideShow<?,?> createSlideShow(NPOIFSFileSystem fs) throws IOException {
public static HSLFSlideShow createSlideShow(final NPOIFSFileSystem fs) throws IOException {
return new HSLFSlideShow(fs);
}
/**
 * Creates a HSLFSlideShow from the given DirectoryNode<p>
 * Note that in order to properly release resources the
 * SlideShow should be closed after use.
 *
 * @param root the directory node passed to the {@code HSLFSlideShow} constructor
 * @return the newly constructed slideshow
 * @throws IOException declared for failures while constructing the slideshow
 */
public static HSLFSlideShow createSlideShow(final DirectoryNode root) throws IOException {
return new HSLFSlideShow(root);
}
}

View File

@ -846,11 +846,15 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
/**
 * Closes the backing filesystem, but only when this document sits on the root
 * directory node.
 *
 * @throws IOException if closing the filesystem fails
 */
@Override
public void close() throws IOException {
    // embedded documents/slideshows shouldn't close the parent container,
    // so anything that is not based on the root node is left alone
    if (getDirectory().getParent() != null) {
        return;
    }

    final NPOIFSFileSystem fileSystem = getDirectory().getFileSystem();
    if (fileSystem != null) {
        fileSystem.close();
    }
}
@Override
protected String getEncryptedPropertyStreamName() {

View File

@ -42,6 +42,10 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.ObjectShape;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.IOUtils;
import org.junit.Test;
@ -76,43 +80,46 @@ public final class TestExtractor {
// ppe.close();
// }
private PowerPointExtractor openExtractor(String fileName) throws IOException {
InputStream is = slTests.openResourceAsStream(fileName);
try {
return new PowerPointExtractor(is);
} finally {
is.close();
private SlideShowExtractor<?,?> openExtractor(String fileName) throws IOException {
try (InputStream is = slTests.openResourceAsStream(fileName)) {
return new SlideShowExtractor(SlideShowFactory.create(is));
}
}
@Test
public void testReadSheetText() throws IOException {
// Basic 2 page example
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
assertEquals(expectText, ppe.getText());
ppe.close();
}
// 1 page example with text boxes
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
assertEquals(expectText2, ppe2.getText());
ppe2.close();
try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
assertEquals(expectText2, ppe.getText());
}
}
@Test
public void testReadNoteText() throws IOException {
// Basic 2 page example
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
String notesText = ppe.getNotes();
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
ppe.setNotesByDefault(true);
ppe.setSlidesByDefault(false);
ppe.setMasterByDefault(false);
String notesText = ppe.getText();
String expText = "\nThese are the notes for page 1\n\nThese are the notes on page two, again lacking formatting\n";
assertEquals(expText, notesText);
ppe.close();
}
// Other one doesn't have notes
PowerPointExtractor ppe2 = openExtractor("with_textbox.ppt");
notesText = ppe2.getNotes();
expText = "";
try (SlideShowExtractor ppe = openExtractor("with_textbox.ppt")) {
ppe.setNotesByDefault(true);
ppe.setSlidesByDefault(false);
ppe.setMasterByDefault(false);
String notesText = ppe.getText();
String expText = "";
assertEquals(expText, notesText);
ppe2.close();
}
}
@Test
@ -126,7 +133,7 @@ public final class TestExtractor {
"\nThese are the notes on page two, again lacking formatting\n"
};
PowerPointExtractor ppe = openExtractor("basic_test_ppt_file.ppt");
try (SlideShowExtractor ppe = openExtractor("basic_test_ppt_file.ppt")) {
ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(false);
assertEquals(slText[0] + slText[1], ppe.getText());
@ -138,7 +145,7 @@ public final class TestExtractor {
ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(true);
assertEquals(slText[0] + ntText[0] + slText[1] + ntText[1], ppe.getText());
ppe.close();
}
}
/**
@ -149,10 +156,13 @@ public final class TestExtractor {
*/
@Test
public void testMissingCoreRecords() throws IOException {
PowerPointExtractor ppe = openExtractor("missing_core_records.ppt");
String text = ppe.getText(true, false);
String nText = ppe.getNotes();
try (SlideShowExtractor<?,?> ppe = openExtractor("missing_core_records.ppt")) {
ppe.setSlidesByDefault(true);
ppe.setNotesByDefault(false);
String text = ppe.getText();
ppe.setSlidesByDefault(false);
ppe.setNotesByDefault(true);
String nText = ppe.getText();
assertNotNull(text);
assertNotNull(nText);
@ -162,32 +172,30 @@ public final class TestExtractor {
// Slide records were fine
assertContains(text, "Using Disease Surveillance and Response");
ppe.close();
}
}
@Test
public void testExtractFromEmbeded() throws IOException {
InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
POIFSFileSystem fs = new POIFSFileSystem(is);
DirectoryNode root = fs.getRoot();
PowerPointExtractor ppe1 = assertExtractFromEmbedded(root, "MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n");
PowerPointExtractor ppe2 = assertExtractFromEmbedded(root, "MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n");
ppe2.close();
ppe1.close();
fs.close();
}
try (final InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
final POIFSFileSystem fs = new POIFSFileSystem(is)) {
final DirectoryNode root = fs.getRoot();
private PowerPointExtractor assertExtractFromEmbedded(DirectoryNode root, String entryName, String expected)
throws IOException {
DirectoryNode dir = (DirectoryNode)root.getEntry(entryName);
final String[] TEST_SET = {
"MBD0000A3B6", "Sample PowerPoint file\nThis is the 1st file\nNot much too it\n",
"MBD0000A3B3", "Sample PowerPoint file\nThis is the 2nd file\nNot much too it either\n"
};
for (int i=0; i<TEST_SET.length; i+=2) {
DirectoryNode dir = (DirectoryNode)root.getEntry(TEST_SET[i]);
assertTrue(dir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT));
// Check the first file
HSLFSlideShowImpl ppt = new HSLFSlideShowImpl(dir);
PowerPointExtractor ppe = new PowerPointExtractor(ppt);
assertEquals(expected, ppe.getText(true, false));
return ppe;
try (final SlideShow<?,?> ppt = SlideShowFactory.create(dir);
final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
assertEquals(TEST_SET[i+1], ppe.getText());
}
}
}
}
/**
@ -195,12 +203,12 @@ public final class TestExtractor {
*/
@Test
public void testExtractFromOwnEmbeded() throws IOException {
PowerPointExtractor ppe = openExtractor("ppt_with_embeded.ppt");
List<HSLFObjectShape> shapes = ppe.getOLEShapes();
try (SlideShowExtractor<?,?> ppe = openExtractor("ppt_with_embeded.ppt")) {
List<? extends ObjectShape> shapes = ppe.getOLEShapes();
assertEquals("Expected 6 ole shapes", 6, shapes.size());
int num_ppt = 0, num_doc = 0, num_xls = 0;
for (HSLFObjectShape ole : shapes) {
String name = ole.getInstanceName();
for (ObjectShape ole : shapes) {
String name = ((HSLFObjectShape)ole).getInstanceName();
InputStream data = ole.getObjectData().getInputStream();
if ("Worksheet".equals(name)) {
HSSFWorkbook wb = new HSSFWorkbook(data);
@ -220,7 +228,7 @@ public final class TestExtractor {
assertEquals("Expected 2 embedded Word Documents", 2, num_doc);
assertEquals("Expected 2 embedded Excel Spreadsheets", 2, num_xls);
assertEquals("Expected 2 embedded PowerPoint Presentations", 2, num_ppt);
ppe.close();
}
}
/**
@ -228,11 +236,11 @@ public final class TestExtractor {
*/
@Test
public void test52991() throws IOException {
PowerPointExtractor ppe = openExtractor("badzip.ppt");
for (HSLFObjectShape shape : ppe.getOLEShapes()) {
try (SlideShowExtractor<?,?> ppe = openExtractor("badzip.ppt")) {
for (ObjectShape shape : ppe.getOLEShapes()) {
IOUtils.copy(shape.getObjectData().getInputStream(), new ByteArrayOutputStream());
}
ppe.close();
}
}
/**
@ -240,27 +248,27 @@ public final class TestExtractor {
*/
@Test
public void testWithComments() throws IOException {
PowerPointExtractor ppe1 = openExtractor("WithComments.ppt");
String text = ppe1.getText();
try (final SlideShowExtractor ppe = openExtractor("WithComments.ppt")) {
String text = ppe.getText();
assertFalse("Comments not in by default", text.contains("This is a test comment"));
ppe1.setCommentsByDefault(true);
ppe.setCommentsByDefault(true);
text = ppe1.getText();
text = ppe.getText();
assertContains(text, "This is a test comment");
ppe1.close();
}
// And another file
PowerPointExtractor ppe2 = openExtractor("45543.ppt");
text = ppe2.getText();
try (SlideShowExtractor ppe = openExtractor("45543.ppt")) {
String text = ppe.getText();
assertFalse("Comments not in by default", text.contains("testdoc"));
ppe2.setCommentsByDefault(true);
ppe.setCommentsByDefault(true);
text = ppe2.getText();
text = ppe.getText();
assertContains(text, "testdoc");
ppe2.close();
}
}
/**
@ -268,48 +276,37 @@ public final class TestExtractor {
*/
@Test
public void testHeaderFooter() throws IOException {
String text;
// With a header on the notes
InputStream is1 = slTests.openResourceAsStream("45537_Header.ppt");
HSLFSlideShow ppt1 = new HSLFSlideShow(is1);
is1.close();
assertNotNull(ppt1.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt1.getNotesHeadersFooters().getHeaderText());
try (InputStream is = slTests.openResourceAsStream("45537_Header.ppt");
HSLFSlideShow ppt = new HSLFSlideShow(is)) {
PowerPointExtractor ppe1 = new PowerPointExtractor(ppt1.getSlideShowImpl());
assertNotNull(ppt.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getHeaderText());
text = ppe1.getText();
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
ppe1.setNotesByDefault(true);
text = ppe1.getText();
assertContains(text, "testdoc");
assertContains(text, "test phrase");
ppe1.close();
ppt1.close();
testHeaderFooterInner(ppt);
}
// And with a footer, also on notes
InputStream is2 = slTests.openResourceAsStream("45537_Footer.ppt");
HSLFSlideShow ppt2 = new HSLFSlideShow(is2);
is2.close();
try (final InputStream is = slTests.openResourceAsStream("45537_Footer.ppt");
final HSLFSlideShow ppt = new HSLFSlideShow(is)) {
assertNotNull(ppt.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt.getNotesHeadersFooters().getFooterText());
assertNotNull(ppt2.getNotesHeadersFooters());
assertEquals("testdoc test phrase", ppt2.getNotesHeadersFooters().getFooterText());
ppt2.close();
testHeaderFooterInner(ppt);
}
}
PowerPointExtractor ppe2 = openExtractor("45537_Footer.ppt");
text = ppe2.getText();
private void testHeaderFooterInner(final HSLFSlideShow ppt) throws IOException {
try (final SlideShowExtractor<?,?> ppe = new SlideShowExtractor(ppt)) {
String text = ppe.getText();
assertFalse("Header shouldn't be there by default\n" + text, text.contains("testdoc"));
assertFalse("Header shouldn't be there by default\n" + text, text.contains("test phrase"));
ppe2.setNotesByDefault(true);
text = ppe2.getText();
ppe.setNotesByDefault(true);
text = ppe.getText();
assertContains(text, "testdoc");
assertContains(text, "test phrase");
ppe2.close();
}
}
@SuppressWarnings("unused")
@ -318,41 +315,40 @@ public final class TestExtractor {
String masterTitleText = "This is the Master Title";
String masterRandomText = "This text comes from the Master Slide";
String masterFooterText = "Footer from the master slide";
PowerPointExtractor ppe = openExtractor("WithMaster.ppt");
try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
ppe.setMasterByDefault(true);
String text = ppe.getText();
assertContains(text, masterRandomText);
assertContains(text, masterFooterText);
ppe.close();
}
}
@Test
public void testMasterText() throws IOException {
PowerPointExtractor ppe1 = openExtractor("master_text.ppt");
try (final SlideShowExtractor ppe = openExtractor("master_text.ppt")) {
// Initially not there
String text = ppe1.getText();
String text = ppe.getText();
assertFalse(text.contains("Text that I added to the master slide"));
// Enable, shows up
ppe1.setMasterByDefault(true);
text = ppe1.getText();
ppe.setMasterByDefault(true);
text = ppe.getText();
assertContains(text, "Text that I added to the master slide");
// Make sure placeholder text does not come out
assertNotContained(text, "Click to edit Master");
ppe1.close();
}
// Now with another file only containing master text
// Will always show up
PowerPointExtractor ppe2 = openExtractor("WithMaster.ppt");
try (final SlideShowExtractor ppe = openExtractor("WithMaster.ppt")) {
String masterText = "Footer from the master slide";
text = ppe2.getText();
String text = ppe.getText();
assertContainsIgnoreCase(text, "master");
assertContains(text, masterText);
ppe2.close();
}
}
/**
@ -360,8 +356,7 @@ public final class TestExtractor {
*/
@Test
public void testChineseText() throws IOException {
PowerPointExtractor ppe = openExtractor("54880_chinese.ppt");
try (final SlideShowExtractor ppe = openExtractor("54880_chinese.ppt")) {
String text = ppe.getText();
// Check for the english text line
@ -375,7 +370,7 @@ public final class TestExtractor {
// Check for the chinese only text line
assertContains(text, "\uff8a\uff9d\uff76\uff78");
ppe.close();
}
}
/**
@ -387,67 +382,59 @@ public final class TestExtractor {
public void testDifferentPOIFS() throws IOException {
// Open the two filesystems
File pptFile = slTests.getFile("basic_test_ppt_file.ppt");
InputStream is1 = new FileInputStream(pptFile);
OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
is1.close();
NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile);
try (final InputStream is1 = new FileInputStream(pptFile);
final NPOIFSFileSystem npoifs = new NPOIFSFileSystem(pptFile)) {
final OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is1);
DirectoryNode[] files = {opoifs.getRoot(), npoifs.getRoot()};
// Open directly
for (DirectoryNode dir : files) {
PowerPointExtractor extractor = new PowerPointExtractor(dir);
try (SlideShow<?,?> ppt = SlideShowFactory.create(dir);
SlideShowExtractor<?,?> extractor = new SlideShowExtractor(ppt)) {
assertEquals(expectText, extractor.getText());
}
// Open via a HSLFSlideShow
for (DirectoryNode dir : files) {
HSLFSlideShowImpl slideshow = new HSLFSlideShowImpl(dir);
PowerPointExtractor extractor = new PowerPointExtractor(slideshow);
assertEquals(expectText, extractor.getText());
extractor.close();
slideshow.close();
}
npoifs.close();
}
}
@Test
public void testTable() throws Exception {
PowerPointExtractor ppe1 = openExtractor("54111.ppt");
String text1 = ppe1.getText();
String target1 = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n"+
try (SlideShowExtractor ppe = openExtractor("54111.ppt")) {
String text = ppe.getText();
String target = "TH Cell 1\tTH Cell 2\tTH Cell 3\tTH Cell 4\n" +
"Row 1, Cell 1\tRow 1, Cell 2\tRow 1, Cell 3\tRow 1, Cell 4\n" +
"Row 2, Cell 1\tRow 2, Cell 2\tRow 2, Cell 3\tRow 2, Cell 4\n" +
"Row 3, Cell 1\tRow 3, Cell 2\tRow 3, Cell 3\tRow 3, Cell 4\n" +
"Row 4, Cell 1\tRow 4, Cell 2\tRow 4, Cell 3\tRow 4, Cell 4\n" +
"Row 5, Cell 1\tRow 5, Cell 2\tRow 5, Cell 3\tRow 5, Cell 4\n";
assertContains(text1, target1);
ppe1.close();
assertContains(text, target);
}
PowerPointExtractor ppe2 = openExtractor("54722.ppt");
String text2 = ppe2.getText();
try (SlideShowExtractor ppe = openExtractor("54722.ppt")) {
String text = ppe.getText();
String target2 = "this\tText\tis\twithin\ta\n" +
String target = "this\tText\tis\twithin\ta\n" +
"table\t1\t2\t3\t4";
assertContains(text2, target2);
ppe2.close();
assertContains(text, target);
}
}
// bug 60003
@Test
public void testExtractMasterSlideFooterText() throws Exception {
PowerPointExtractor ppe = openExtractor("60003.ppt");
try (SlideShowExtractor ppe = openExtractor("60003.ppt")) {
ppe.setMasterByDefault(true);
String text = ppe.getText();
assertContains(text, "Prague");
ppe.close();
}
}
@Test
public void testExtractGroupedShapeText() throws Exception {
try (final PowerPointExtractor ppe = openExtractor("bug62092.ppt")) {
try (final SlideShowExtractor ppe = openExtractor("bug62092.ppt")) {
final String text = ppe.getText();
//this tests that we're ignoring text shapes at depth=0

View File

@ -73,6 +73,7 @@ import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.sl.draw.DrawFactory;
import org.apache.poi.sl.draw.DrawPaint;
import org.apache.poi.sl.draw.DrawTextParagraph;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.ColorStyle;
import org.apache.poi.sl.usermodel.PaintStyle;
import org.apache.poi.sl.usermodel.PaintStyle.SolidPaint;
@ -800,18 +801,18 @@ public final class TestBugs {
String files[] = { "bug58718_008524.ppt","bug58718_008558.ppt","bug58718_349008.ppt","bug58718_008495.ppt", };
for (String f : files) {
File sample = HSLFTestDataSamples.getSampleFile(f);
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
assertNotNull(ex.getText());
ex.close();
}
}
}
@Test
public void bug58733() throws IOException {
File sample = HSLFTestDataSamples.getSampleFile("bug58733_671884.ppt");
PowerPointExtractor ex = new PowerPointExtractor(sample.getAbsolutePath());
try (SlideShowExtractor ex = new SlideShowExtractor(SlideShowFactory.create(sample))) {
assertNotNull(ex.getText());
ex.close();
}
}
@Test

Binary file not shown.