#64411 - Provide JigSaw modules

- rework extractors - see bugzilla entry for more information

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1880839 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2020-08-13 21:08:24 +00:00
parent 4bf968d6bd
commit dfdf9e6d6f
63 changed files with 1802 additions and 1791 deletions

View File

@ -29,11 +29,11 @@ import java.util.HashSet;
import java.util.Set; import java.util.Set;
import org.apache.poi.EncryptedDocumentException; import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.ss.extractor.ExcelExtractor; import org.apache.poi.ss.extractor.ExcelExtractor;
import org.apache.poi.util.IOUtils; import org.apache.poi.util.IOUtils;

View File

@ -23,7 +23,7 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.ooxml.extractor.ExtractorFactory; import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlideShow; import org.apache.poi.xslf.usermodel.XSLFSlideShow;
@ -53,7 +53,8 @@ public class XSLFFileHandler extends SlideShowHandler {
// additionally try the other getText() methods // additionally try the other getText() methods
try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) { //noinspection rawtypes
try (SlideShowExtractor extractor = (SlideShowExtractor) ExtractorFactory.createExtractor(file)) {
assertNotNull(extractor); assertNotNull(extractor);
extractor.setSlidesByDefault(true); extractor.setSlidesByDefault(true);
extractor.setNotesByDefault(true); extractor.setNotesByDefault(true);

View File

@ -0,0 +1,304 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.extractor;
import static org.apache.poi.hssf.record.crypto.Biff8EncryptionKey.getCurrentUserPassword;
import static org.apache.poi.poifs.crypt.EncryptionInfo.ENCRYPTION_INFO_ENTRY;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.ServiceLoader;
import java.util.stream.StreamSupport;
import org.apache.poi.EmptyFileException;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* Figures out the correct {@link POITextExtractor} for your supplied
* document, and returns it.
*
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar and/or
* the POI OOXML jar is not present on the runtime classpath/modulepath</p>
* <p>Note 2 - the extractors themselves are created by the {@link ExtractorProvider}
* implementations found via {@link java.util.ServiceLoader}. Text extractor creation
* across all formats previously required
* {@code org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
* the OOXML jar.</p>
* <p>Note 3 - rather than using this, for most cases you would be better
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
*/
@SuppressWarnings({"WeakerAccess", "JavadocReference"})
public final class ExtractorFactory {
/** Logger used to report embedded streams for which no extractor could be created. */
private static final POILogger LOGGER = POILogFactory.getLogger(ExtractorFactory.class);

/** Should this thread prefer event based over usermodel based extractors? Starts out as {@code false}. */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);

/**
 * Should all threads prefer event based over usermodel based extractors?
 * {@code null} means "not set", in which case the thread level setting is used.
 * Declared {@code volatile}, because the value is written by one thread and read by
 * other threads without any further synchronization.
 */
private static volatile Boolean allPreferEventExtractors;
/**
 * Holder of the one {@link ExtractorFactory} instance. The instance is created
 * when this nested class is initialized, i.e. on the first access to {@link #INSTANCE}.
 */
private static final class Singleton {
    private static final ExtractorFactory INSTANCE = new ExtractorFactory();

    private Singleton() {
        // never instantiated, only holds INSTANCE
    }
}
/**
 * Callback which asks one {@link ExtractorProvider} to create an extractor.
 * A {@code null} result means that the provider couldn't handle the document.
 */
@FunctionalInterface
private interface ProviderMethod {
    POITextExtractor create(ExtractorProvider provider) throws IOException;
}
/** The providers found by the {@link ServiceLoader}, in the order they were returned. */
private final List<ExtractorProvider> provider = new ArrayList<>();

/** Registers every {@link ExtractorProvider} the {@link ServiceLoader} is able to find. */
private ExtractorFactory() {
    for (ExtractorProvider found : ServiceLoader.load(ExtractorProvider.class)) {
        provider.add(found);
    }
}
/**
* Should this thread prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory)
* Default is false.
*
* @return true if event extractors should be preferred in the current thread, false otherwise.
*/
public static boolean getThreadPrefersEventExtractors() {
return threadPreferEventExtractors.get();
}
/**
* Should all threads prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory)
* Default is to use the thread level setting, which defaults to false.
*
* @return true if event extractors should be preferred in all threads, false if not,
* or null if the all-threads setting hasn't been set.
*/
public static Boolean getAllThreadsPreferEventExtractors() {
return allPreferEventExtractors;
}
/**
* Should this thread prefer event based over usermodel based extractors?
* Will only be used if the All Threads setting is null.
*
* @param preferEventExtractors If this thread should prefer event based extractors.
*/
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
threadPreferEventExtractors.set(preferEventExtractors);
}
/**
* Should all threads prefer event based over usermodel based extractors?
* If set, will take precedence over the Thread level setting.
*
* @param preferEventExtractors If all threads should prefer event based extractors,
* or null to fall back to the Thread level setting.
*/
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
allPreferEventExtractors = preferEventExtractors;
}
/**
 * Should this thread use event based extractors if available?
 * Checks the all-threads setting first, then the thread specific one.
 *
 * @return If the current thread should use event based extractors.
 */
public static boolean getPreferEventExtractor() {
    // Read the shared field exactly once: if another thread reset it to null between
    // the null check and a second read, the unboxing would end in a NullPointerException.
    final Boolean allThreads = allPreferEventExtractors;
    return (allThreads != null) ? allThreads : threadPreferEventExtractors.get();
}
/**
 * Creates the extractor for the document held by the given filesystem, using the
 * password returned by {@code Biff8EncryptionKey.getCurrentUserPassword()}.
 *
 * @param fs the filesystem holding the document
 * @return the extractor
 * @throws IOException if creating the extractor fails
 */
public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
    final String currentPassword = getCurrentUserPassword();
    return createExtractor(fs, currentPassword);
}
/**
 * Creates the extractor for the document found in the root directory of the given filesystem.
 *
 * @param fs the filesystem holding the document
 * @param password the password handed on to the extractor providers
 * @return the extractor
 * @throws IOException if creating the extractor fails
 */
public static POITextExtractor createExtractor(POIFSFileSystem fs, String password) throws IOException {
    final DirectoryNode rootNode = fs.getRoot();
    return createExtractor(rootNode, password);
}
/**
 * Creates the extractor for the document read from the given stream, using the
 * password returned by {@code Biff8EncryptionKey.getCurrentUserPassword()}.
 *
 * @param input the stream providing the document
 * @return the extractor
 * @throws IOException if creating the extractor fails
 */
public static POITextExtractor createExtractor(InputStream input) throws IOException {
    final String currentPassword = getCurrentUserPassword();
    return createExtractor(input, currentPassword);
}
/**
 * Creates the extractor for the document read from the given stream.
 * The type of the document is detected via {@link FileMagic}; only OOXML and
 * OLE2 documents are supported.
 *
 * @param input the stream providing the document
 * @param password the password handed on to the extractor providers
 * @return the extractor
 * @throws EmptyFileException if not even a single byte can be read from the stream
 * @throws IOException if the type of the document is unsupported or if
 *  creating the extractor fails
 */
public static POITextExtractor createExtractor(InputStream input, String password) throws IOException {
    // mark()/reset() are used below, so continue with the stream prepared by FileMagic
    final InputStream is = FileMagic.prepareToCheckMagic(input);
    byte[] emptyFileCheck = new byte[1];
    is.mark(emptyFileCheck.length);
    if (is.read(emptyFileCheck) < emptyFileCheck.length) {
        throw new EmptyFileException();
    }
    is.reset();

    final FileMagic fm = FileMagic.valueOf(is);
    if (FileMagic.OOXML == fm) {
        return wp(fm, w -> w.create(is, password));
    }

    if (FileMagic.OLE2 != fm) {
        throw new IOException("Can't create extractor - unsupported file type: "+fm);
    }

    final POIFSFileSystem poifs = new POIFSFileSystem(is);
    try {
        boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
        return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
    } catch (IOException | RuntimeException e) {
        // no extractor took over the filesystem, so it has to be released here -
        // same handling as in createExtractor(File, String)
        IOUtils.closeQuietly(poifs);
        throw e;
    }
}
/**
 * Creates the extractor for the document stored in the given file, using the
 * password returned by {@code Biff8EncryptionKey.getCurrentUserPassword()}.
 *
 * @param file the file holding the document
 * @return the extractor
 * @throws IOException if creating the extractor fails
 */
public static POITextExtractor createExtractor(File file) throws IOException {
    final String currentPassword = getCurrentUserPassword();
    return createExtractor(file, currentPassword);
}
/**
 * Creates the extractor for the document stored in the given file.
 * The type of the document is detected via {@link FileMagic}; only OOXML and
 * OLE2 documents are supported.
 *
 * @param file the file holding the document
 * @param password the password handed on to the extractor providers
 * @return the extractor
 * @throws EmptyFileException if the length of the file is reported as 0
 * @throws IOException if the type of the document is unsupported or if
 *  creating the extractor fails
 */
public static POITextExtractor createExtractor(File file, String password) throws IOException {
    if (file.length() == 0) {
        throw new EmptyFileException();
    }

    final FileMagic magic = FileMagic.valueOf(file);
    if (magic == FileMagic.OOXML) {
        return wp(magic, prov -> prov.create(file, password));
    } else if (magic != FileMagic.OLE2) {
        throw new IOException("Can't create extractor - unsupported file type: "+magic);
    }

    final POIFSFileSystem fileSystem = new POIFSFileSystem(file, true);
    try {
        // an OLE2 container with an encryption info entry is handed to the OOXML providers
        final FileMagic effectiveMagic =
            fileSystem.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY) ? FileMagic.OOXML : magic;
        return wp(effectiveMagic, prov -> prov.create(fileSystem.getRoot(), password));
    } catch (IOException | RuntimeException failure) {
        // no extractor took over the filesystem, so release the file again
        IOUtils.closeQuietly(fileSystem);
        throw failure;
    }
}
/**
 * Creates the extractor for the document found in the given directory node, using
 * the password returned by {@code Biff8EncryptionKey.getCurrentUserPassword()}.
 *
 * @param root The {@link DirectoryNode} pointing to a document.
 *
 * @return The resulting {@link POITextExtractor}, an exception is thrown if
 *  no TextExtractor can be created for some reason.
 *
 * @throws IOException If no registered {@link ExtractorProvider} returns an extractor
 *  for the document or if a provider fails to read it
 * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
 *  an unsupported version of Excel.
 * @throws IllegalArgumentException If creating the Extractor fails
 */
public static POITextExtractor createExtractor(DirectoryNode root) throws IOException {
    final String currentPassword = getCurrentUserPassword();
    return createExtractor(root, currentPassword);
}
/**
 * Creates the extractor for the document found in the given directory node.
 *
 * @param root the {@link DirectoryNode} pointing to a document
 * @param password the password handed on to the extractor providers
 * @return the extractor
 * @throws IOException if creating the extractor fails
 */
public static POITextExtractor createExtractor(final DirectoryNode root, String password) throws IOException {
    // Encrypted OOXML files go inside OLE2 containers, is this one?
    final boolean holdsOOXML = root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY) || root.hasEntry("Package");
    final FileMagic magic = holdsOOXML ? FileMagic.OOXML : FileMagic.OLE2;
    return wp(magic, prov -> prov.create(root, password));
}
/**
 * Returns an array of text extractors, one for each of
 * the embedded documents in the file (if there are any).
 * If there are no embedded documents, you'll get back an
 * empty array. Otherwise, you'll get one open
 * {@link POITextExtractor} for each embedded document an
 * extractor could be created for.
 *
 * @param ext The extractor to look at for embedded documents
 *
 * @return An array of resulting extractors. Empty if no embedded documents are found.
 *
 * @throws IOException If creating the extractor of an embedded directory fails
 * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
 *  an unsupported version of Excel.
 * @throws IllegalArgumentException If creating the Extractor fails
 * @throws IllegalStateException If no extractor is given or if the extractor
 *  doesn't know its root directory
 */
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
    if (ext == null) {
        throw new IllegalStateException("extractor must be given");
    }

    // All the embedded directories we spotted
    List<Entry> dirs = new ArrayList<>();
    // For anything else not directly held in as a POIFS directory
    List<InputStream> nonPOIFS = new ArrayList<>();

    // Find all the embedded directories
    DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    if (ext instanceof ExcelExtractor) {
        // These are in MBD... under the root
        StreamSupport.stream(root.spliterator(), false)
            .filter(entry -> entry.getName().startsWith("MBD"))
            .forEach(dirs::add);
    } else {
        // NOTE(review): only the first provider accepting OLE2 is asked. A provider which
        // inherits the default identifyEmbeddedResources() (it always throws) ends the
        // lookup with an IllegalArgumentException, if it happens to be registered first
        // - confirm the provider order.
        for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
            if (prov.accepts(FileMagic.OLE2)) {
                prov.identifyEmbeddedResources(ext, dirs, nonPOIFS);
                break;
            }
        }
    }

    // Create the extractors
    if (dirs.isEmpty() && nonPOIFS.isEmpty()) {
        return new POITextExtractor[0];
    }

    List<POITextExtractor> textExtractors = new ArrayList<>();
    for (Entry dir : dirs) {
        // the entries above are picked by their name only, so make sure this really
        // is a directory instead of failing with a ClassCastException
        if (dir instanceof DirectoryNode) {
            textExtractors.add(createExtractor((DirectoryNode) dir));
        } else {
            LOGGER.log(POILogger.INFO, "Skipping embedded entry which is not a directory: ", dir.getName());
        }
    }
    for (InputStream stream : nonPOIFS) {
        try {
            textExtractors.add(createExtractor(stream));
        } catch (IOException e) {
            // Ignore, just means it didn't contain a format we support as yet
            // (the logger appends its arguments as-is, hence the trailing separator)
            LOGGER.log(POILogger.INFO, "Format not supported yet: ", e.getLocalizedMessage());
        }
    }
    return textExtractors.toArray(new POITextExtractor[0]);
}
/**
 * Asks the registered providers, one after the other, to create the extractor.
 * Providers which don't accept the given type are skipped; the first extractor
 * which isn't {@code null} is returned.
 *
 * @param fm the type of the document
 * @param fun the callback invoking the matching {@code create} method of a provider
 * @return the extractor, never {@code null}
 * @throws IOException if no provider returned an extractor or if a provider failed
 */
private static POITextExtractor wp(FileMagic fm, ProviderMethod fun) throws IOException {
    for (ExtractorProvider candidate : Singleton.INSTANCE.provider) {
        if (!candidate.accepts(fm)) {
            continue;
        }
        final POITextExtractor extractor = fun.create(candidate);
        if (extractor != null) {
            return extractor;
        }
    }

    throw new IOException("Your InputStream was neither an OLE2 stream, nor an OOXML stream " +
        "or you haven't provide the poi-ooxml*.jar and/or poi-scratchpad*.jar in the classpath/modulepath - FileMagic: "+fm);
}
}

View File

@ -0,0 +1,76 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.extractor;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.FileMagic;
/**
* Service interface of the format specific extractor factories.
* {@link ExtractorFactory} collects the implementations via {@link java.util.ServiceLoader}
* and asks those accepting the {@link FileMagic} of a document to create its extractor.
*/
public interface ExtractorProvider {
/**
* Tells whether this provider is to be asked for documents of the given type
* @param fm the type of the document
* @return {@code true} if this provider handles documents of the given type
*/
boolean accepts(FileMagic fm);
/**
* Create Extractor via file
* @param file the file
* @param password the password or {@code null} if not encrypted
* @return the extractor - {@link ExtractorFactory} takes {@code null} as
* "not handled by this provider" and continues with the next one
* @throws IOException if file can't be read or parsed
*/
POITextExtractor create(File file, String password) throws IOException;
/**
* Create Extractor via InputStream
* @param inputStream the stream
* @param password the password or {@code null} if not encrypted
* @return the extractor - {@link ExtractorFactory} takes {@code null} as
* "not handled by this provider" and continues with the next one
* @throws IOException if stream can't be read or parsed
*/
POITextExtractor create(InputStream inputStream, String password) throws IOException;
/**
* Create Extractor from POIFS node
* @param poifsDir the node
* @param password the password or {@code null} if not encrypted
* @return the extractor - {@link ExtractorFactory} takes {@code null} as
* "not handled by this provider" and continues with the next one
* @throws IOException if node can't be parsed
*/
POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException;
/**
* Identifies the documents embedded in the document of the given extractor.
* Nothing is returned, the findings are added to the two given lists instead.
* This default implementation doesn't look for embedded documents at all, but
* always throws an {@link IllegalArgumentException}.
*
* @param ext the extractor holding the directory to start parsing
* @param dirs a list to be filled with directory references holding embedded documents
* @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
*
* @throws IOException when the format specific extraction fails because of invalid entries
*/
default void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources");
}
}

View File

@ -0,0 +1,76 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.extractor;
import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.extractor.OldExcelExtractor;
import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* ExtractorFactory for HSSF and Old Excel format
*/
public class MainExtractorFactory implements ExtractorProvider {
@Override
public boolean accepts(FileMagic fm) {
return FileMagic.OLE2 == fm;
}
@Override
public POITextExtractor create(File file, String password) throws IOException {
return create(new POIFSFileSystem(file, true).getRoot(), password);
}
@Override
public POITextExtractor create(InputStream inputStream, String password) throws IOException {
return create(new POIFSFileSystem(inputStream).getRoot(), password);
}
@Override
public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
try {
Biff8EncryptionKey.setCurrentUserPassword(password);
// Look for certain entries in the stream, to figure it out from
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
if (poifsDir.hasEntry(workbookName)) {
return ExtractorFactory.getPreferEventExtractor() ? new EventBasedExcelExtractor(poifsDir) : new ExcelExtractor(poifsDir);
}
}
if (poifsDir.hasEntry(InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME)) {
return new OldExcelExtractor(poifsDir);
}
} finally {
Biff8EncryptionKey.setCurrentUserPassword(oldPW);
}
return null;
}
}

View File

@ -1,279 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.extractor;
import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME;
import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
* Figures out the correct POIOLE2TextExtractor for your supplied
* document, and returns it.
*
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
* not present on the runtime classpath</p>
* <p>Note 2 - for text extractor creation across all formats, use
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
* the OOXML jar.</p>
* <p>Note 3 - rather than using this, for most cases you would be better
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
*/
@SuppressWarnings({"WeakerAccess", "JavadocReference"})
public final class OLE2ExtractorFactory {
/** Logger used to report missing jars and embedded streams which couldn't be handled. */
private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);

/** Should this thread prefer event based over usermodel based extractors? Starts out as {@code false}. */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);

/**
 * Should all threads prefer event based over usermodel based extractors?
 * {@code null} means "not set", in which case the thread level setting is used.
 * Declared {@code volatile}, because the value is written by one thread and read by
 * other threads without any further synchronization.
 */
private static volatile Boolean allPreferEventExtractors;

/** Only static methods are provided, hence no instances. */
private OLE2ExtractorFactory() {
}
/**
* Should this thread prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory)
* Default is false.
*
* @return true if event extractors should be preferred in the current thread, false otherwise.
*/
public static boolean getThreadPrefersEventExtractors() {
return threadPreferEventExtractors.get();
}
/**
* Should all threads prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory)
* Default is to use the thread level setting, which defaults to false.
*
* @return true if event extractors should be preferred in all threads, false if not,
* or null if the all-threads setting hasn't been set.
*/
public static Boolean getAllThreadsPreferEventExtractors() {
return allPreferEventExtractors;
}
/**
* Should this thread prefer event based over usermodel based extractors?
* Will only be used if the All Threads setting is null.
*
* @param preferEventExtractors If this thread should prefer event based extractors.
*/
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
threadPreferEventExtractors.set(preferEventExtractors);
}
/**
* Should all threads prefer event based over usermodel based extractors?
* If set, will take precedence over the Thread level setting.
*
* @param preferEventExtractors If all threads should prefer event based extractors,
* or null to fall back to the Thread level setting.
*/
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
allPreferEventExtractors = preferEventExtractors;
}
/**
 * Should this thread use event based extractors if available?
 * Checks the all-threads setting first, then the thread specific one.
 *
 * @return If the current thread should use event based extractors.
 */
public static boolean getPreferEventExtractor() {
    // Read the shared field exactly once: if another thread reset it to null between
    // the null check and a second read, the unboxing would end in a NullPointerException.
    final Boolean allThreads = allPreferEventExtractors;
    if (allThreads != null) {
        return allThreads;
    }
    return threadPreferEventExtractors.get();
}
/**
 * Creates the extractor for the document found in the root directory of the given filesystem.
 *
 * @param fs the filesystem holding the document
 * @param <T> the extractor type expected by the caller - the cast isn't checked here
 * @return the extractor
 * @throws IOException if creating the extractor fails
 */
@SuppressWarnings("unchecked")
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
    final POITextExtractor extractor = createExtractor(fs.getRoot());
    return (T) extractor;
}
/**
 * Creates the extractor for the document read from the given stream.
 * If the class looked up by {@code getOOXMLClass()} is available, its
 * {@code createExtractor(InputStream)} method is called via reflection,
 * otherwise the stream is opened as POIFS filesystem.
 *
 * @param input the stream providing the document
 * @param <T> the extractor type expected by the caller - the cast isn't checked here
 * @return the extractor
 * @throws IOException if reading the document fails
 * @throws IllegalArgumentException if creating the extractor fails for another reason
 */
@SuppressWarnings("unchecked")
public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
    Class<?> cls = getOOXMLClass();
    if (cls != null) {
        // Use Reflection to get us the full OOXML-enabled version
        try {
            Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
            return (T)m.invoke(null, input);
        } catch (IllegalArgumentException iae) {
            throw iae;
        } catch (java.lang.reflect.InvocationTargetException ite) {
            // Exceptions of the invoked method arrive wrapped - hand out the ones this
            // method declares/rethrows as they are, instead of hiding them in a wrapper
            final Throwable cause = ite.getCause();
            if (cause instanceof IOException) {
                throw (IOException) cause;
            }
            if (cause instanceof IllegalArgumentException) {
                throw (IllegalArgumentException) cause;
            }
            throw new IllegalArgumentException("Error creating Extractor for InputStream", ite);
        } catch (Exception e) {
            throw new IllegalArgumentException("Error creating Extractor for InputStream", e);
        }
    } else {
        // Best hope it's OLE2....
        return createExtractor(new POIFSFileSystem(input));
    }
}
/**
 * Looks up the class {@code org.apache.poi.extractor.ExtractorFactory} via the
 * class loader of this class.
 *
 * @return the class or {@code null} (after logging a warning) if it can't be found
 */
private static Class<?> getOOXMLClass() {
    final ClassLoader loader = OLE2ExtractorFactory.class.getClassLoader();
    Class<?> ooxmlClass;
    try {
        ooxmlClass = loader.loadClass("org.apache.poi.extractor.ExtractorFactory");
    } catch (ClassNotFoundException e) {
        LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
        ooxmlClass = null;
    }
    return ooxmlClass;
}
/**
 * Looks up the class {@code org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory}
 * via the class loader of this class.
 *
 * @return the class, never {@code null}
 * @throws IllegalStateException if the class can't be found
 */
private static Class<?> getScratchpadClass() {
    try {
        return OLE2ExtractorFactory.class.getClassLoader().loadClass(
            "org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory"
        );
    } catch (ClassNotFoundException e) {
        LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
        // keep the original exception as cause, so the failed lookup stays traceable
        throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory", e);
    }
}
/**
 * Create the Extractor, if possible. Generally needs the Scratchpad jar.
 * Note that this won't check for embedded OOXML resources either, use
 * {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
 *
 * @param poifsDir The {@link DirectoryNode} pointing to a document.
 *
 * @return The resulting {@link POITextExtractor}, an exception is thrown if
 *  no TextExtractor can be created for some reason.
 *
 * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
 * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
 *  an unsupported version of Excel.
 * @throws IllegalArgumentException If creating the Extractor fails
 * @throws IllegalStateException If the Scratchpad class is needed, but can't be found
 */
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
    // Look for certain entries in the stream, to figure it
    // out from
    for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
        if (poifsDir.hasEntry(workbookName)) {
            if (getPreferEventExtractor()) {
                return new EventBasedExcelExtractor(poifsDir);
            }
            return new ExcelExtractor(poifsDir);
        }
    }
    if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
        throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
                + "found. Please call OldExcelExtractor directly for basic text extraction");
    }

    // Ask Scratchpad, or fail trying
    Class<?> cls = getScratchpadClass();
    try {
        Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
        POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
        if (ext != null) return ext;
    } catch (IllegalArgumentException iae) {
        throw iae;
    } catch (java.lang.reflect.InvocationTargetException ite) {
        // Exceptions of the invoked method arrive wrapped - hand out the documented
        // ones as they are, instead of hiding them in a wrapper
        final Throwable cause = ite.getCause();
        if (cause instanceof IOException) {
            throw (IOException) cause;
        }
        if (cause instanceof IllegalArgumentException) {
            throw (IllegalArgumentException) cause;
        }
        throw new IllegalArgumentException("Error creating Scratchpad Extractor", ite);
    } catch (Exception e) {
        throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
    }

    throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
}
/**
 * Returns an array of text extractors, one for each of
 * the embedded documents in the file (if there are any).
 * If there are no embedded documents, you'll get back an
 * empty array. Otherwise, you'll get one open
 * {@link POITextExtractor} for each embedded document an
 * extractor could be created for.
 *
 * @param ext The extractor to look at for embedded documents
 *
 * @return An array of resulting extractors. Empty if no embedded documents are found.
 *
 * @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
 * @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
 *  an unsupported version of Excel.
 * @throws IllegalArgumentException If creating the Extractor fails
 * @throws IllegalStateException If the extractor doesn't know its root directory
 */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
    // All the embedded directories we spotted
    List<Entry> dirs = new ArrayList<>();
    // For anything else not directly held in as a POIFS directory
    List<InputStream> nonPOIFS = new ArrayList<>();

    // Find all the embedded directories
    DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    if (ext instanceof ExcelExtractor) {
        // These are in MBD... under the root
        Iterator<Entry> it = root.getEntries();
        while (it.hasNext()) {
            Entry entry = it.next();
            if (entry.getName().startsWith("MBD")) {
                dirs.add(entry);
            }
        }
    } else {
        // Ask Scratchpad, or fail trying
        Class<?> cls = getScratchpadClass();
        try {
            Method m = cls.getDeclaredMethod(
                "identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
            m.invoke(null, ext, dirs, nonPOIFS);
        } catch (java.lang.reflect.InvocationTargetException ite) {
            // Exceptions of the invoked method arrive wrapped - hand out the documented
            // ones as they are, instead of hiding them in a wrapper
            final Throwable cause = ite.getCause();
            if (cause instanceof IOException) {
                throw (IOException) cause;
            }
            if (cause instanceof IllegalArgumentException) {
                throw (IllegalArgumentException) cause;
            }
            throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", ite);
        } catch (Exception e) {
            throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
        }
    }

    // Create the extractors
    if (dirs.isEmpty() && nonPOIFS.isEmpty()) {
        return new POITextExtractor[0];
    }

    List<POITextExtractor> extractors = new ArrayList<>();
    for (Entry dir : dirs) {
        // the entries above are picked by their name only, so make sure this really
        // is a directory instead of failing with a ClassCastException
        if (dir instanceof DirectoryNode) {
            extractors.add(createExtractor((DirectoryNode) dir));
        } else {
            LOGGER.log(POILogger.WARN, "Skipping embedded entry which is not a directory: ", dir.getName());
        }
    }
    for (InputStream stream : nonPOIFS) {
        try {
            extractors.add(createExtractor(stream));
        } catch (Exception xe) {
            // Ignore, invalid format
            LOGGER.log(POILogger.WARN, xe);
        }
    }
    return extractors.toArray(new POITextExtractor[0]);
}
}

View File

@ -30,55 +30,28 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
* org.apache.poi.[format].extractor . * org.apache.poi.[format].extractor .
* *
* @see org.apache.poi.hssf.extractor.ExcelExtractor * @see org.apache.poi.hssf.extractor.ExcelExtractor
* @see org.apache.poi.hslf.extractor.PowerPointExtractor
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor * @see org.apache.poi.hdgf.extractor.VisioTextExtractor
* @see org.apache.poi.hwpf.extractor.WordExtractor * @see org.apache.poi.hwpf.extractor.WordExtractor
*/ */
public abstract class POIOLE2TextExtractor extends POITextExtractor { public interface POIOLE2TextExtractor extends POITextExtractor {
/** The POIDocument that's open */
protected POIDocument document;
/**
* Creates a new text extractor for the given document
*
* @param document The POIDocument to use in this extractor.
*/
public POIOLE2TextExtractor(POIDocument document) {
this.document = document;
// Ensure any underlying resources, such as open files,
// will get cleaned up if the user calls #close()
setFilesystem(document);
}
/**
* Creates a new text extractor, using the same
* document as another text extractor. Normally
* only used by properties extractors.
*
* @param otherExtractor the extractor which document to be used
*/
protected POIOLE2TextExtractor(POIOLE2TextExtractor otherExtractor) {
this.document = otherExtractor.document;
}
/** /**
* Returns the document information metadata for the document * Returns the document information metadata for the document
* *
* @return The Document Summary Information or null * @return The Document Summary Information or null
* if it could not be read for this document. * if it could not be read for this document.
*/ */
public DocumentSummaryInformation getDocSummaryInformation() { default DocumentSummaryInformation getDocSummaryInformation() {
return document.getDocumentSummaryInformation(); return getDocument().getDocumentSummaryInformation();
} }
/** /**
* Returns the summary information metadata for the document. * Returns the summary information metadata for the document.
* *
* @return The Summary information for the document or null * @return The Summary information for the document or null
* if it could not be read for this document. * if it could not be read for this document.
*/ */
public SummaryInformation getSummaryInformation() { default SummaryInformation getSummaryInformation() {
return document.getSummaryInformation(); return getDocument().getSummaryInformation();
} }
/** /**
@ -88,7 +61,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
* @return an instance of POIExtractor that can extract meta-data. * @return an instance of POIExtractor that can extract meta-data.
*/ */
@Override @Override
public POITextExtractor getMetadataTextExtractor() { default POITextExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(this); return new HPSFPropertiesExtractor(this);
} }
@ -97,8 +70,8 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
* *
* @return the DirectoryEntry that is associated with the POIDocument of this extractor. * @return the DirectoryEntry that is associated with the POIDocument of this extractor.
*/ */
public DirectoryEntry getRoot() { default DirectoryEntry getRoot() {
return document.getDirectory(); return getDocument().getDirectory();
} }
/** /**
@ -107,7 +80,5 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
* @return the underlying POIDocument * @return the underlying POIDocument
*/ */
@Override @Override
public POIDocument getDocument() { POIDocument getDocument();
return document;
}
} }

View File

@ -27,13 +27,10 @@ import java.io.IOException;
* org.apache.poi.[format].extractor . * org.apache.poi.[format].extractor .
* *
* @see org.apache.poi.hssf.extractor.ExcelExtractor * @see org.apache.poi.hssf.extractor.ExcelExtractor
* @see org.apache.poi.hslf.extractor.PowerPointExtractor
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor * @see org.apache.poi.hdgf.extractor.VisioTextExtractor
* @see org.apache.poi.hwpf.extractor.WordExtractor * @see org.apache.poi.hwpf.extractor.WordExtractor
*/ */
public abstract class POITextExtractor implements Closeable { public interface POITextExtractor extends Closeable {
private Closeable fsToClose;
/** /**
* Retrieves all the text from the document. * Retrieves all the text from the document.
* How cells, paragraphs etc are separated in the text * How cells, paragraphs etc are separated in the text
@ -41,7 +38,7 @@ public abstract class POITextExtractor implements Closeable {
* a specific project for details. * a specific project for details.
* @return All the text from the document * @return All the text from the document
*/ */
public abstract String getText(); String getText();
/** /**
* Returns another text extractor, which is able to * Returns another text extractor, which is able to
@ -50,16 +47,23 @@ public abstract class POITextExtractor implements Closeable {
* *
* @return the metadata and text extractor * @return the metadata and text extractor
*/ */
public abstract POITextExtractor getMetadataTextExtractor(); POITextExtractor getMetadataTextExtractor();
/** /**
* Used to ensure file handle cleanup. * @param doCloseFilesystem {@code true} (default), if underlying resources/filesystem should be
* * closed on {@link #close()}
* @param fs filesystem to close
*/ */
public void setFilesystem(Closeable fs) { void setCloseFilesystem(boolean doCloseFilesystem);
fsToClose = fs;
} /**
* @return {@code true}, if resources/filesystem should be closed on {@link #close()}
*/
boolean isCloseFilesystem();
/**
* @return The underlying resources/filesystem
*/
Closeable getFilesystem();
/** /**
* Allows to free resources of the Extractor as soon as * Allows to free resources of the Extractor as soon as
@ -69,14 +73,15 @@ public abstract class POITextExtractor implements Closeable {
* The Extractor cannot be used after close has been called. * The Extractor cannot be used after close has been called.
*/ */
@Override @Override
public void close() throws IOException { default void close() throws IOException {
if(fsToClose != null) { Closeable fs = getFilesystem();
fsToClose.close(); if (isCloseFilesystem() && fs != null) {
fs.close();
} }
} }
/** /**
* @return the processed document * @return the processed document
*/ */
public abstract Object getDocument(); Object getDocument();
} }

View File

@ -17,9 +17,6 @@
package org.apache.poi.hpsf.extractor; package org.apache.poi.hpsf.extractor;
import java.io.File;
import java.io.IOException;
import org.apache.poi.POIDocument; import org.apache.poi.POIDocument;
import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.extractor.POITextExtractor;
@ -37,15 +34,20 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* built-in and custom, returning them in * built-in and custom, returning them in
* textual form. * textual form.
*/ */
public class HPSFPropertiesExtractor extends POIOLE2TextExtractor { public class HPSFPropertiesExtractor implements POIOLE2TextExtractor {
private final POIDocument document;
private boolean doCloseFilesystem = true;
public HPSFPropertiesExtractor(POIOLE2TextExtractor mainExtractor) { public HPSFPropertiesExtractor(POIOLE2TextExtractor mainExtractor) {
super(mainExtractor); document = mainExtractor.getDocument();
} }
public HPSFPropertiesExtractor(POIDocument doc) {
super(doc); public HPSFPropertiesExtractor(POIDocument document) {
this.document = document;
} }
public HPSFPropertiesExtractor(POIFSFileSystem fs) { public HPSFPropertiesExtractor(POIFSFileSystem fs) {
super(new HPSFPropertiesOnlyDocument(fs)); document = new HPSFPropertiesOnlyDocument(fs);
} }
public String getDocumentSummaryInformationText() { public String getDocumentSummaryInformationText() {
@ -137,12 +139,23 @@ public class HPSFPropertiesExtractor extends POIOLE2TextExtractor {
return super.hashCode(); return super.hashCode();
} }
public static void main(String[] args) throws IOException { @Override
for (String file : args) { public POIDocument getDocument() {
try (HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor( return document;
new POIFSFileSystem(new File(file)))) { }
System.out.println(ext.getText());
} @Override
} public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public POIDocument getFilesystem() {
return document;
} }
} }

View File

@ -17,6 +17,7 @@
package org.apache.poi.hssf.extractor; package org.apache.poi.hssf.extractor;
import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -37,9 +38,9 @@ import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NoteRecord; import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord; import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -59,26 +60,28 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* *
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a> * @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
*/ */
public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor { public class EventBasedExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
private DirectoryNode _dir; private final POIFSFileSystem poifs;
private final DirectoryNode _dir;
private boolean doCloseFilesystem = true;
boolean _includeSheetNames = true; boolean _includeSheetNames = true;
boolean _formulasNotResults; boolean _formulasNotResults;
public EventBasedExcelExtractor( DirectoryNode dir ) public EventBasedExcelExtractor(DirectoryNode dir) {
{ poifs = null;
super( (POIDocument)null );
_dir = dir; _dir = dir;
} }
public EventBasedExcelExtractor(POIFSFileSystem fs) { public EventBasedExcelExtractor(POIFSFileSystem fs) {
this(fs.getRoot()); poifs = fs;
super.setFilesystem(fs); _dir = fs.getRoot();
} }
/** /**
* Would return the document information metadata for the document, * Would return the document information metadata for the document,
* if we supported it * if we supported it
*/ */
@Override
public DocumentSummaryInformation getDocSummaryInformation() { public DocumentSummaryInformation getDocSummaryInformation() {
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor"); throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
} }
@ -86,6 +89,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or
* Would return the summary information metadata for the document, * Would return the summary information metadata for the document,
* if we supported it * if we supported it
*/ */
@Override
public SummaryInformation getSummaryInformation() { public SummaryInformation getSummaryInformation() {
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor"); throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
} }
@ -262,4 +266,29 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or
} }
} }
} }
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public Closeable getFilesystem() {
return poifs;
}
@Override
public POIDocument getDocument() {
return null;
}
@Override
public DirectoryEntry getRoot() {
return _dir;
}
} }

View File

@ -53,9 +53,10 @@ import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;
* *
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a> * @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
*/ */
public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor { public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
private final HSSFWorkbook _wb; private final HSSFWorkbook _wb;
private final HSSFDataFormatter _formatter; private final HSSFDataFormatter _formatter;
private boolean doCloseFilesystem = true;
private boolean _includeSheetNames = true; private boolean _includeSheetNames = true;
private boolean _shouldEvaluateFormulas = true; private boolean _shouldEvaluateFormulas = true;
private boolean _includeCellComments; private boolean _includeCellComments;
@ -63,13 +64,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
private boolean _includeHeadersFooters = true; private boolean _includeHeadersFooters = true;
public ExcelExtractor(HSSFWorkbook wb) { public ExcelExtractor(HSSFWorkbook wb) {
super(wb);
_wb = wb; _wb = wb;
_formatter = new HSSFDataFormatter(); _formatter = new HSSFDataFormatter();
} }
public ExcelExtractor(POIFSFileSystem fs) throws IOException { public ExcelExtractor(POIFSFileSystem fs) throws IOException {
this(fs.getRoot()); this(fs.getRoot());
} }
public ExcelExtractor(DirectoryNode dir) throws IOException { public ExcelExtractor(DirectoryNode dir) throws IOException {
this(new HSSFWorkbook(dir, true)); this(new HSSFWorkbook(dir, true));
} }
@ -225,7 +227,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile()); try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile());
HSSFWorkbook wb = new HSSFWorkbook(is); HSSFWorkbook wb = new HSSFWorkbook(is);
ExcelExtractor extractor = new ExcelExtractor(wb); ExcelExtractor extractor = new ExcelExtractor(wb)
) { ) {
extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames()); extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames());
extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas()); extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas());
@ -411,4 +413,24 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
return text.toString(); return text.toString();
} }
@Override
public HSSFWorkbook getDocument() {
return _wb;
}
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public HSSFWorkbook getFilesystem() {
return _wb;
}
} }

View File

@ -29,6 +29,7 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.EncryptedDocumentException; import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.OldExcelFormatException; import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.record.BOFRecord; import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.CodepageRecord; import org.apache.poi.hssf.record.CodepageRecord;
@ -58,7 +59,7 @@ import org.apache.poi.util.IOUtils;
* by Apache Tika, but not really intended for display to the user. * by Apache Tika, but not really intended for display to the user.
* </p> * </p>
*/ */
public class OldExcelExtractor implements Closeable { public class OldExcelExtractor implements POITextExtractor {
private final static int FILE_PASS_RECORD_SID = 0x2f; private final static int FILE_PASS_RECORD_SID = 0x2f;
//arbitrarily selected; may need to increase //arbitrarily selected; may need to increase
@ -295,24 +296,39 @@ public class OldExcelExtractor implements Closeable {
} }
} }
close();
ris = null; ris = null;
return text.toString(); return text.toString();
} }
@Override
public void close() {
// some cases require this close here
if(toClose != null) {
IOUtils.closeQuietly(toClose);
toClose = null;
}
}
protected void handleNumericCell(StringBuilder text, double value) { protected void handleNumericCell(StringBuilder text, double value) {
// TODO Need to fetch / use format strings // TODO Need to fetch / use format strings
text.append(value); text.append(value);
text.append('\n'); text.append('\n');
} }
@Override
public POITextExtractor getMetadataTextExtractor() {
return null;
}
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
}
@Override
public boolean isCloseFilesystem() {
return toClose != null;
}
@Override
public Closeable getFilesystem() {
return toClose;
}
@Override
public Object getDocument() {
return ris;
}
} }

View File

@ -54,14 +54,14 @@ import org.apache.poi.util.POILogger;
public class SlideShowExtractor< public class SlideShowExtractor<
S extends Shape<S,P>, S extends Shape<S,P>,
P extends TextParagraph<S,P,? extends TextRun> P extends TextParagraph<S,P,? extends TextRun>
> extends POITextExtractor { > implements POITextExtractor {
private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class); private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class);
// placeholder text for slide numbers // placeholder text for slide numbers
private static final String SLIDE_NUMBER_PH = "#"; private static final String SLIDE_NUMBER_PH = "#";
private SlideShow<S,P> slideshow; protected final SlideShow<S,P> slideshow;
private boolean slidesByDefault = true; private boolean slidesByDefault = true;
private boolean notesByDefault; private boolean notesByDefault;
@ -69,9 +69,9 @@ public class SlideShowExtractor<
private boolean masterByDefault; private boolean masterByDefault;
private Predicate<Object> filter = o -> true; private Predicate<Object> filter = o -> true;
private boolean doCloseFilesystem = true;
public SlideShowExtractor(final SlideShow<S,P> slideshow) { public SlideShowExtractor(final SlideShow<S,P> slideshow) {
setFilesystem(slideshow);
this.slideshow = slideshow; this.slideshow = slideshow;
} }
@ -81,8 +81,8 @@ public class SlideShowExtractor<
* @return the opened document * @return the opened document
*/ */
@Override @Override
public final Object getDocument() { public SlideShow<S,P> getDocument() {
return slideshow.getPersistDocument(); return slideshow;
} }
/** /**
@ -339,17 +339,17 @@ public class SlideShowExtractor<
return raw; return raw;
} }
TextParagraph tp = tr.getParagraph(); TextParagraph<?,?,?> tp = tr.getParagraph();
TextShape ps = (tp != null) ? tp.getParentShape() : null; TextShape<?,?> ps = (tp != null) ? tp.getParentShape() : null;
Sheet sh = (ps != null) ? ps.getSheet() : null; Sheet<?,?> sh = (ps != null) ? ps.getSheet() : null;
String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide)sh).getSlideNumber() + 1) : ""; String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide<?,?>)sh).getSlideNumber() + 1) : "";
return raw.replace(SLIDE_NUMBER_PH, slideNr); return raw.replace(SLIDE_NUMBER_PH, slideNr);
} }
private static String replaceTextCap(TextRun tr) { private static String replaceTextCap(TextRun tr) {
final TextParagraph tp = tr.getParagraph(); final TextParagraph<?,?,?> tp = tr.getParagraph();
final TextShape sh = (tp != null) ? tp.getParentShape() : null; final TextShape<?,?> sh = (tp != null) ? tp.getParentShape() : null;
final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null; final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null;
// 0xB acts like carriage return in page titles and like blank in the others // 0xB acts like carriage return in page titles and like blank in the others
@ -438,4 +438,19 @@ public class SlideShowExtractor<
(italic == null || tr.isItalic() == italic) && (italic == null || tr.isItalic() == italic) &&
(bold == null || tr.isBold() == bold); (bold == null || tr.isBold() == bold);
} }
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public SlideShow<S,P> getFilesystem() {
return getDocument();
}
} }

View File

@ -27,7 +27,7 @@ public interface ExcelExtractor {
* *
* @param includeSheetNames {@code true} if the sheet names should be included * @param includeSheetNames {@code true} if the sheet names should be included
*/ */
public void setIncludeSheetNames(boolean includeSheetNames); void setIncludeSheetNames(boolean includeSheetNames);
/** /**
* Should we return the formula itself, and not the result it produces? * Should we return the formula itself, and not the result it produces?
@ -35,7 +35,7 @@ public interface ExcelExtractor {
* *
* @param formulasNotResults {@code true} if the formula itself is returned * @param formulasNotResults {@code true} if the formula itself is returned
*/ */
public void setFormulasNotResults(boolean formulasNotResults); void setFormulasNotResults(boolean formulasNotResults);
/** /**
* Should headers and footers be included in the output? * Should headers and footers be included in the output?
@ -43,7 +43,7 @@ public interface ExcelExtractor {
* *
* @param includeHeadersFooters {@code true} if headers and footers should be included * @param includeHeadersFooters {@code true} if headers and footers should be included
*/ */
public void setIncludeHeadersFooters(boolean includeHeadersFooters); void setIncludeHeadersFooters(boolean includeHeadersFooters);
/** /**
* Should cell comments be included? * Should cell comments be included?
@ -51,12 +51,12 @@ public interface ExcelExtractor {
* *
* @param includeCellComments {@code true} if cell comments should be included * @param includeCellComments {@code true} if cell comments should be included
*/ */
public void setIncludeCellComments(boolean includeCellComments); void setIncludeCellComments(boolean includeCellComments);
/** /**
* Retrieves the text contents of the file * Retrieves the text contents of the file
* *
* @return the text contents of the file * @return the text contents of the file
*/ */
public String getText(); String getText();
} }

View File

@ -29,6 +29,7 @@ module org.apache.poi.ooxml {
requires java.security.jgss; requires java.security.jgss;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory; provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
exports org.apache.poi.xwpf.extractor; exports org.apache.poi.xwpf.extractor;
exports org.apache.poi.xwpf.usermodel; exports org.apache.poi.xwpf.usermodel;

View File

@ -29,6 +29,7 @@ module org.apache.poi.ooxml {
requires java.security.jgss; requires java.security.jgss;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory; provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
exports org.apache.poi.xwpf.extractor; exports org.apache.poi.xwpf.extractor;
exports org.apache.poi.xwpf.usermodel; exports org.apache.poi.xwpf.usermodel;

View File

@ -28,8 +28,12 @@ module org.apache.poi.poi {
requires jdk.unsupported; requires jdk.unsupported;
uses org.apache.poi.ss.usermodel.WorkbookProvider; uses org.apache.poi.ss.usermodel.WorkbookProvider;
uses org.apache.poi.extractor.ExtractorProvider;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory; provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory;
exports org.apache.poi; exports org.apache.poi;
exports org.apache.poi.common; exports org.apache.poi.common;

View File

@ -28,8 +28,10 @@ module org.apache.poi.poi {
requires jdk.unsupported; requires jdk.unsupported;
uses org.apache.poi.ss.usermodel.WorkbookProvider; uses org.apache.poi.ss.usermodel.WorkbookProvider;
uses org.apache.poi.extractor.ExtractorProvider;
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory; provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory;
exports org.apache.poi; exports org.apache.poi;
exports org.apache.poi.common; exports org.apache.poi.common;

View File

@ -20,6 +20,8 @@ module org.apache.poi.scratchpad {
requires java.desktop; requires java.desktop;
requires commons.math3; requires commons.math3;
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory;
exports org.apache.poi.hmef; exports org.apache.poi.hmef;
exports org.apache.poi.hmef.dev; exports org.apache.poi.hmef.dev;
exports org.apache.poi.hmef.extractor; exports org.apache.poi.hmef.extractor;

View File

@ -20,6 +20,8 @@ module org.apache.poi.scratchpad {
requires java.desktop; requires java.desktop;
requires commons.math3; requires commons.math3;
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory;
exports org.apache.poi.hmef; exports org.apache.poi.hmef;
exports org.apache.poi.hmef.dev; exports org.apache.poi.hmef.dev;
exports org.apache.poi.hmef.extractor; exports org.apache.poi.hmef.extractor;

View File

@ -18,15 +18,19 @@ package org.apache.poi.ooxml.extractor;
import java.io.File; import java.io.File;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.extractor.POITextExtractor;
/** /**
* A command line wrapper around {@link ExtractorFactory}, useful * A command line wrapper around {@link ExtractorFactory}, useful
* for when debugging. * for when debugging.
*/ */
public class CommandLineTextExtractor { public final class CommandLineTextExtractor {
public static final String DIVIDER = "======================="; public static final String DIVIDER = "=======================";
private CommandLineTextExtractor() {
}
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
if (args.length < 1) { if (args.length < 1) {
System.err.println("Use:"); System.err.println("Use:");

View File

@ -1,384 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.ooxml.extractor;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.extractor.OLE2ExtractorFactory;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.NotOLE2FileException;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.NotImplemented;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.xmlbeans.XmlException;
/**
* Figures out the correct POITextExtractor for your supplied
* document, and returns it.
*
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
* not present on the runtime classpath</p>
* <p>Note 2 - rather than using this, for most cases you would be better
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
*/
@SuppressWarnings("WeakerAccess")
public final class ExtractorFactory {
// Logger bound to this factory class.
private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class);
// Relationship type aliases copied from PackageRelationshipTypes; createExtractor(OPCPackage)
// looks them up in the order core -> strict -> visio to find the package's core document.
public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
// XSLF (presentation) relation types; going by the name these are the ones the factory
// accepts for slideshow extraction - NOTE(review): usage is outside this view, confirm.
private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
XSLFRelation.PRESENTATION_MACRO
};
// The class is final and the constructor private, so no instance can be created
// from outside; the visible API consists of static methods only.
private ExtractorFactory() {
}
/**
 * Tells whether the calling thread favours event based extractors over
 * usermodel based ones. Usermodel extractors tend to be more accurate,
 * but they use more memory. The default is {@code false}.
 *
 * @return the thread level preference, as reported by {@link OLE2ExtractorFactory}
 */
public static boolean getThreadPrefersEventExtractors() {
    // The setting itself is kept by OLE2ExtractorFactory; this method only forwards.
    final boolean threadPreference = OLE2ExtractorFactory.getThreadPrefersEventExtractors();
    return threadPreference;
}
/**
 * Tells whether all threads should favour event based extractors over
 * usermodel based ones. Usermodel extractors tend to be more accurate,
 * but they use more memory. By default the thread level setting is used,
 * and that one defaults to {@code false}.
 *
 * @return the all-threads preference, as reported by {@link OLE2ExtractorFactory}
 *         (a {@link Boolean}; presumably {@code null} while no global choice was made)
 */
public static Boolean getAllThreadsPreferEventExtractors() {
    // The setting itself is kept by OLE2ExtractorFactory; this method only forwards.
    final Boolean globalPreference = OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
    return globalPreference;
}
/**
 * Sets whether the calling thread favours event based extractors over
 * usermodel based ones. The value is only consulted while the
 * all-threads setting is {@code null}.
 *
 * @param preferEventExtractors {@code true} to prefer event based extractors on this thread
 */
public static void setThreadPrefersEventExtractors(final boolean preferEventExtractors) {
    // Forwarded unchanged - OLE2ExtractorFactory owns the actual state.
    OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
}
/**
 * Sets whether all threads favour event based extractors over usermodel
 * based ones. Once set, this value takes precedence over the thread
 * level setting.
 *
 * @param preferEventExtractors the preference to apply to all threads
 */
public static void setAllThreadsPreferEventExtractors(final Boolean preferEventExtractors) {
    // Forwarded unchanged - OLE2ExtractorFactory owns the actual state.
    OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
}
/**
 * Tells whether this thread should use an event based extractor if one
 * is available. The all-threads setting is checked first, then the
 * thread specific one.
 *
 * @return the effective preference, as reported by {@link OLE2ExtractorFactory}
 */
public static boolean getPreferEventExtractor() {
    // The lookup order described above is implemented by OLE2ExtractorFactory.
    final boolean effectivePreference = OLE2ExtractorFactory.getPreferEventExtractor();
    return effectivePreference;
}
/**
 * Opens the given file and returns a text extractor for it.
 * <p>
 * The file is first opened as an OLE2 container ({@link POIFSFileSystem}).
 * A container that holds the {@link Decryptor#DEFAULT_POIFS_ENTRY} entry is
 * passed on to the encrypted OOXML handling; any other container is passed
 * to the {@link POIFSFileSystem} based factory method and afterwards
 * registered on the extractor via {@code setFilesystem}. If opening raises
 * an {@link OfficeXmlFileException}, the file is reopened read-only as an
 * {@link OPCPackage}, which is registered on the extractor the same way.
 * <p>
 * The result is cast to {@code T} unchecked, so the caller is responsible
 * for asking for a type that matches the file.
 *
 * @param f the file to create the extractor for
 * @param <T> the extractor type expected by the caller
 * @return the extractor created for the file
 * @throws IllegalArgumentException if a {@link NotOLE2FileException} was raised,
 *         i.e. the file was neither an OLE2 file nor an OOXML file
 * @throws IOException rethrown after the OLE2 container was closed
 * @throws OpenXML4JException rethrown after the OLE2 container was closed
 * @throws XmlException rethrown after the OLE2 container was closed
 */
@SuppressWarnings("unchecked")
public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
    // stays null if the POIFSFileSystem constructor itself fails
    POIFSFileSystem poifs = null;
    try {
        poifs = new POIFSFileSystem(f);

        final boolean encrypted = poifs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY);
        if (encrypted) {
            return (T) createEncryptedOOXMLExtractor(poifs);
        }

        final POITextExtractor ole2Extractor = createExtractor(poifs);
        // hand the container to the extractor, so closing the extractor closes the file
        ole2Extractor.setFilesystem(poifs);
        return (T) ole2Extractor;
    } catch (OfficeXmlFileException e) {
        // ensure file-handle release before the same file is opened again as a package
        IOUtils.closeQuietly(poifs);

        final OPCPackage opcPackage = OPCPackage.open(f.toString(), PackageAccess.READ);
        final T ooxmlExtractor = (T) createExtractor(opcPackage);
        ooxmlExtractor.setFilesystem(opcPackage);
        return ooxmlExtractor;
    } catch (NotOLE2FileException ne) {
        // ensure file-handle release
        IOUtils.closeQuietly(poifs);
        throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne);
    } catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR
        // ensure file-handle release, then pass the failure on untouched
        IOUtils.closeQuietly(poifs);
        throw e;
    }
}
/**
 * Creates a text extractor for the document supplied as a stream.
 * <p>
 * The stream is passed through {@link FileMagic#prepareToCheckMagic(InputStream)}
 * and its {@link FileMagic} decides the route: OLE2 content is loaded into a
 * {@link POIFSFileSystem} (taking the encrypted OOXML path when the
 * encrypted-package entry is present), OOXML content is opened as an
 * {@link OPCPackage}.
 *
 * @param inp the stream to read the document from
 * @return the extractor for the document
 * @throws IOException if reading the stream fails
 * @throws OpenXML4JException if the OOXML package cannot be processed
 * @throws XmlException if an XML parsing error occurs
 * @throws IllegalArgumentException if the stream is neither OLE2 nor OOXML
 */
public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException {
    final InputStream prepared = FileMagic.prepareToCheckMagic(inp);
    final FileMagic magic = FileMagic.valueOf(prepared);

    switch (magic) {
        case OLE2: {
            final POIFSFileSystem poifs = new POIFSFileSystem(prepared);
            if (poifs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
                return createEncryptedOOXMLExtractor(poifs);
            }
            return createExtractor(poifs);
        }
        case OOXML:
            return createExtractor(OPCPackage.open(prepared));
        default:
            throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream, found type: " + magic);
    }
}
/**
 * Determines the document type of an OOXML package and builds the matching text extractor.
 * <p>
 * The core document relationship is looked up first, then the OOXML-Strict
 * one, then the Visio one. The content type of the single core part selects
 * the extractor (spreadsheet, word processing, slideshow, slideshow theme or
 * binary spreadsheet). On any failure the package is reverted before the
 * exception is rethrown.
 *
 * @param pkg An {@link OPCPackage}.
 * @return A text extractor matching the core document of the package.
 * @throws IOException If an error occurs while reading the file
 * @throws OpenXML4JException If an error parsing the OpenXML file format is found.
 * @throws XmlException If an XML parsing error occurs.
 * @throws IllegalArgumentException If there is not exactly one core document, or no matching file type could be found.
 */
public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
    try {
        // Normal Office core document first; fall back to Strict, then to Visio
        PackageRelationshipCollection coreRels = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
        if (coreRels.size() == 0) {
            coreRels = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
            if (coreRels.size() == 0) {
                coreRels = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
                if (coreRels.size() == 1) {
                    return new XDGFVisioExtractor(pkg);
                }
            }
        }

        // Exactly one core document is expected at this point
        final int coreCount = coreRels.size();
        if (coreCount != 1) {
            throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + coreCount);
        }

        // The content type of the core part tells the formats apart
        final String contentType = pkg.getPart(coreRels.getRelationship(0)).getContentType();

        // Spreadsheet (XSSF)
        for (XSSFRelation xssfType : XSSFExcelExtractor.SUPPORTED_TYPES) {
            if (xssfType.getContentType().equals(contentType)) {
                return getPreferEventExtractor()
                        ? new XSSFEventBasedExcelExtractor(pkg)
                        : new XSSFExcelExtractor(pkg);
            }
        }

        // Word processing (XWPF)
        for (XWPFRelation xwpfType : XWPFWordExtractor.SUPPORTED_TYPES) {
            if (xwpfType.getContentType().equals(contentType)) {
                return new XWPFWordExtractor(pkg);
            }
        }

        // Slideshow (XSLF)
        for (XSLFRelation xslfType : SUPPORTED_XSLF_TYPES) {
            if (xslfType.getContentType().equals(contentType)) {
                return new SlideShowExtractor<>(new XMLSlideShow(pkg));
            }
        }

        // Slideshow theme files are handled like slideshows
        if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
            return new SlideShowExtractor<>(new XMLSlideShow(pkg));
        }

        // Binary spreadsheet (xlsb)
        for (XSSFRelation xlsbType : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
            if (xlsbType.getContentType().equals(contentType)) {
                return new XSSFBEventBasedExcelExtractor(pkg);
            }
        }

        throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")");
    } catch (IOException | Error | RuntimeException | XmlException | OpenXML4JException failure) { // NOSONAR
        // Give the package up on error, but via revert() rather than close(): close() would
        // re-write the file, which is very likely not wanted for a TextExtractor!
        pkg.revert();
        throw failure;
    }
}
/**
 * Creates a text extractor for an OLE2 file system by inspecting its root directory.
 *
 * @param fs the file system to create an extractor for
 * @return the extractor created for the root directory of {@code fs}
 * @throws IOException if reading the file system fails
 * @throws OpenXML4JException if an embedded OOXML package cannot be processed
 * @throws XmlException if an XML parsing error occurs
 */
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
    final DirectoryNode rootDir = fs.getRoot();
    return createExtractor(rootDir);
}
/**
 * Creates a text extractor for the document stored in the given OLE2 directory.
 * <p>
 * If the directory contains an entry named {@code "Package"}, that entry is
 * opened as an OOXML package and handed to the OOXML detection. Otherwise
 * the directory is passed on to {@code OLE2ExtractorFactory}.
 *
 * @param poifsDir the OLE2 directory to inspect
 * @return the extractor for the contained document
 * @throws IOException if reading from the directory fails
 * @throws OpenXML4JException if the embedded OOXML package cannot be processed
 * @throws XmlException if an XML parsing error occurs
 */
@SuppressWarnings("unchecked")
public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
{
    // First, check for OOXML
    if (poifsDir.hasEntry("Package")) {
        // The document stream is only needed while the package is being opened,
        // so make sure it is closed again - also when opening or detection fails.
        try (InputStream packageStream = poifsDir.createDocumentInputStream("Package")) {
            OPCPackage pkg = OPCPackage.open(packageStream);
            return (T) createExtractor(pkg);
        }
    }

    // If not, ask the OLE2 code to check, with Scratchpad if possible
    return (T) OLE2ExtractorFactory.createExtractor(poifsDir);
}
/**
 * Builds one text extractor per document embedded in the file behind the
 * given extractor.
 * <p>
 * For {@link ExcelExtractor} instances the embedded documents are the root
 * entries whose name starts with {@code "MBD"}. For every other extractor
 * the lookup is delegated reflectively to the scratchpad factory; if that
 * reflective call fails, a warning is logged and an empty array is returned.
 * Embedded streams in an unsupported format are logged and skipped.
 *
 * @param ext the extractor whose embedded documents are wanted
 * @return one open {@link POITextExtractor} per embedded document, or an
 *         empty array if there are none
 * @throws IllegalStateException if the extractor has no root directory
 * @throws IOException if an embedded document cannot be read; XML and
 *         OpenXML4J problems of embedded streams are reported this way, too
 */
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
    final DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    // embedded documents held as POIFS directories ...
    final List<Entry> embeddedDirs = new ArrayList<>();
    // ... and everything else, available as plain streams only
    final List<InputStream> embeddedStreams = new ArrayList<>();

    if (ext instanceof ExcelExtractor) {
        // provide ExcelExtractor also in OOXML module, because scratchpad is not necessary for it;
        // the embedded parts are in MBD... under the root
        for (Iterator<Entry> entries = root.getEntries(); entries.hasNext(); ) {
            final Entry candidate = entries.next();
            if (candidate.getName().startsWith("MBD")) {
                embeddedDirs.add(candidate);
            }
        }
    } else {
        try {
            final Class<?> scratchpadFactory = Class.forName("org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory");
            final Method identify = scratchpadFactory.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
            identify.invoke(null, ext, embeddedDirs, embeddedStreams);
        } catch (ReflectiveOperationException e) {
            logger.log(POILogger.WARN, "POI Scratchpad jar not included ", e.getLocalizedMessage());
            return new POITextExtractor[0];
        }
    }

    if (embeddedDirs.isEmpty() && embeddedStreams.isEmpty()) {
        return new POITextExtractor[0];
    }

    final List<POITextExtractor> extractors = new ArrayList<>();
    for (Entry embeddedDir : embeddedDirs) {
        extractors.add(createExtractor((DirectoryNode) embeddedDir));
    }
    for (InputStream embeddedStream : embeddedStreams) {
        try {
            extractors.add(createExtractor(embeddedStream));
        } catch (IllegalArgumentException e) {
            // not an error: the embedded part is simply in a format without extractor support
            logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
        } catch (XmlException | OpenXML4JException e) {
            throw new IOException(e.getMessage(), e);
        }
    }
    return extractors.toArray(new POITextExtractor[0]);
}
/**
 * Placeholder for returning one text extractor per document embedded in
 * an OOXML file.
 * <p>
 * This overload is not implemented: it never returns and unconditionally
 * throws an {@link IllegalStateException}.
 *
 * @param ext the OOXML extractor whose embedded documents are wanted (unused)
 * @return never returns normally
 * @throws IllegalStateException always, with the message "Not yet supported"
 */
@NotImplemented
@SuppressWarnings({"UnusedParameters", "UnusedReturnValue"})
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) {
throw new IllegalStateException("Not yet supported");
}
/**
 * Decrypts the OOXML package stored in the given OLE2 file system and
 * creates a text extractor for it.
 * <p>
 * The password is taken from {@link Biff8EncryptionKey#getCurrentUserPassword()};
 * if none is set, {@link Decryptor#DEFAULT_PASSWORD} is tried. The file
 * system is always closed before this method returns or throws, because
 * all data is read while decrypting.
 *
 * @param fs the OLE2 file system holding the encrypted package
 * @return the extractor for the decrypted OOXML package
 * @throws IOException if reading or decrypting fails with an I/O error
 * @throws EncryptedDocumentException if the password is wrong or any other
 *         problem occurs while decrypting or opening the package
 */
private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs)
throws IOException {
    String pass = Biff8EncryptionKey.getCurrentUserPassword();
    if (pass == null) {
        pass = Decryptor.DEFAULT_PASSWORD;
    }

    InputStream is = null;
    try {
        // Set up inside the outer try so the file system is closed even if
        // the encryption info cannot be read.
        EncryptionInfo ei = new EncryptionInfo(fs);
        Decryptor dec = ei.getDecryptor();

        try {
            if (!dec.verifyPassword(pass)) {
                throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor");
            }
            is = dec.getDataStream(fs);
            return createExtractor(OPCPackage.open(is));
        } catch (IOException | EncryptedDocumentException e) {
            // Already of a type callers expect; rethrow as is so the
            // "Invalid password" message is not buried in a wrapper.
            throw e;
        } catch (Exception e) {
            throw new EncryptedDocumentException(e);
        }
    } finally {
        IOUtils.closeQuietly(is);

        // also close the POIFSFileSystem here as we read all the data
        // while decrypting
        fs.close();
    }
}
}

View File

@ -0,0 +1,281 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.ooxml.extractor;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.ExtractorProvider;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xslf.extractor.XSLFExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.xmlbeans.XmlException;
/**
 * {@link ExtractorProvider} for OOXML documents: figures out the correct
 * text extractor for a supplied OOXML package and returns it.
 *
 * <p>Inputs which are not OOXML are handed back to {@link ExtractorFactory}.</p>
 * <p>Note - rather than using this, for most cases you would be better
 * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
 */
@SuppressWarnings("WeakerAccess")
public final class POIXMLExtractorFactory implements ExtractorProvider {
    private static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
    private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
    private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;

    /** Core part types for which a slideshow extractor is created. */
    private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{
        XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
        XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
        XSLFRelation.PRESENTATION_MACRO
    };

    /**
     * @param fm the detected file type
     * @return {@code true} only for {@link FileMagic#OOXML}
     */
    @Override
    public boolean accepts(FileMagic fm) {
        return fm == FileMagic.OOXML;
    }

    /**
     * Should this thread prefer event based over usermodel based extractors?
     * (usermodel extractors tend to be more accurate, but use more memory)
     * Default is false.
     *
     * @return the value reported by {@link ExtractorFactory}
     */
    public static boolean getThreadPrefersEventExtractors() {
        return ExtractorFactory.getThreadPrefersEventExtractors();
    }

    /**
     * Should all threads prefer event based over usermodel based extractors?
     * (usermodel extractors tend to be more accurate, but use more memory)
     * Default is to use the thread level setting, which defaults to false.
     *
     * @return the value reported by {@link ExtractorFactory}
     */
    public static Boolean getAllThreadsPreferEventExtractors() {
        return ExtractorFactory.getAllThreadsPreferEventExtractors();
    }

    /**
     * Should this thread prefer event based over usermodel based extractors?
     * Will only be used if the All Threads setting is null.
     *
     * @param preferEventExtractors the preference, forwarded to {@link ExtractorFactory}
     */
    public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
         ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
    }

    /**
     * Should all threads prefer event based over usermodel based extractors?
     * If set, will take preference over the Thread level setting.
     *
     * @param preferEventExtractors the preference, forwarded to {@link ExtractorFactory}
     */
    public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
         ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
    }

    /**
     * Should this thread use event based extractors if available?
     * Checks the all-threads one first, then thread specific.
     *
     * @return the value reported by {@link ExtractorFactory}
     */
    public static boolean getPreferEventExtractor() {
         return ExtractorFactory.getPreferEventExtractor();
    }

    /**
     * Creates an extractor for the given file.
     * <p>
     * Files which are not OOXML are passed on to {@link ExtractorFactory}.
     * OOXML files are opened read-only; the package is reverted again if no
     * extractor could be created for it.
     *
     * @param f the file to open
     * @param password the password, only used when delegating to {@link ExtractorFactory}
     * @return the extractor, or {@code null} if the OOXML package holds no supported document
     * @throws IOException if the file cannot be read or is not a valid package
     */
    @Override
    public POITextExtractor create(File f, String password) throws IOException {
        if (FileMagic.valueOf(f) != FileMagic.OOXML) {
            return ExtractorFactory.createExtractor(f, password);
        }

        OPCPackage pkg = null;
        try {
            pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
            POIXMLTextExtractor ex = create(pkg);
            if (ex == null) {
                pkg.revert();
            }
            return ex;
        } catch (InvalidFormatException ife) {
            throw new IOException(ife);
        } catch (RuntimeException | IOException e) {
            // same clean-up as in the InputStream variant: only revert a package
            // that was actually opened, so the original exception is not replaced
            // by a NullPointerException
            if (pkg != null) {
                pkg.revert();
            }
            throw e;
        }
    }

    /**
     * Creates an extractor for the document supplied as a stream.
     * <p>
     * Streams which are not OOXML are passed on to {@link ExtractorFactory}.
     * The opened package is reverted again if no extractor could be created for it.
     *
     * @param inp the stream to read the document from
     * @param password the password, only used when delegating to {@link ExtractorFactory}
     * @return the extractor, or {@code null} if the OOXML package holds no supported document
     * @throws IOException if the stream cannot be read or is not a valid package
     */
    public POITextExtractor create(InputStream inp, String password) throws IOException {
        InputStream is = FileMagic.prepareToCheckMagic(inp);

        if (FileMagic.valueOf(is) != FileMagic.OOXML) {
            return ExtractorFactory.createExtractor(is, password);
        }

        OPCPackage pkg = null;
        try {
            pkg = OPCPackage.open(is);
            POIXMLTextExtractor ex = create(pkg);
            if (ex == null) {
                pkg.revert();
            }
            return ex;
        } catch (InvalidFormatException e) {
            throw new IOException(e);
        } catch (RuntimeException | IOException e) {
            if (pkg != null) {
                pkg.revert();
            }
            throw e;
        }
    }

    /**
     * Tries to determine the actual type of file and produces a matching text-extractor for it.
     * <p>
     * The package is neither closed nor reverted here, also not on failure;
     * this is the responsibility of the caller.
     *
     * @param pkg An {@link OPCPackage}.
     * @return A {@link POIXMLTextExtractor} for the given file, or {@code null}
     *         if the content type of the core document is not supported.
     * @throws IOException If an error occurs while reading the file. Every other
     *         failure - including a package which does not contain exactly one
     *         core document - is wrapped in an {@code IOException}, too.
     */
    public POIXMLTextExtractor create(OPCPackage pkg) throws IOException {
        try {
            // Check for the normal Office core document
            PackageRelationshipCollection core;
            core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);

            // If nothing was found, try some of the other OOXML-based core types
            if (core.size() == 0) {
                // Could it be an OOXML-Strict one?
                core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
            }
            if (core.size() == 0) {
                // Could it be a visio one?
                core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
                if (core.size() == 1) {
                    return new XDGFVisioExtractor(pkg);
                }
            }

            // Should just be a single core document, complain if not
            if (core.size() != 1) {
                throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
            }

            // Grab the core document part, and try to identify from that
            final PackagePart corePart = pkg.getPart(core.getRelationship(0));
            final String contentType = corePart.getContentType();

            // Is it XSSF?
            for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
                if (rel.getContentType().equals(contentType)) {
                    if (getPreferEventExtractor()) {
                        return new XSSFEventBasedExcelExtractor(pkg);
                    }
                    return new XSSFExcelExtractor(pkg);
                }
            }

            // Is it XWPF?
            for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
                if (rel.getContentType().equals(contentType)) {
                    return new XWPFWordExtractor(pkg);
                }
            }

            // Is it XSLF?
            for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) {
                if (rel.getContentType().equals(contentType)) {
                    return new XSLFExtractor(new XMLSlideShow(pkg));
                }
            }

            // special handling for SlideShow-Theme-files,
            if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
                return new XSLFExtractor(new XMLSlideShow(pkg));
            }

            // How about xlsb?
            for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
                if (rel.getContentType().equals(contentType)) {
                    return new XSSFBEventBasedExcelExtractor(pkg);
                }
            }

            return null;
        } catch (IOException e) {
            throw e;
        } catch (Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
            throw new IOException(e);
        }
        // we used to close (revert()) the package here, but this is the callers responsibility
        // and we can't reuse the package
    }

    /**
     * Creates an extractor for the OOXML package contained in an OLE2 file system,
     * using {@link Biff8EncryptionKey#getCurrentUserPassword()} as password.
     *
     * @param fs the OLE2 file system
     * @return the extractor for the contained OOXML package
     * @throws IOException if no package is contained or it cannot be read
     */
    public POITextExtractor create(POIFSFileSystem fs) throws IOException {
        return create(fs.getRoot(), Biff8EncryptionKey.getCurrentUserPassword());
    }

    /**
     * Creates an extractor for the OOXML package contained in an OLE2 directory.
     * <p>
     * A plain package is expected in the entry {@code "Package"}, an encrypted
     * one in the entry named by {@link Decryptor#DEFAULT_POIFS_ENTRY}.
     *
     * @param poifsDir the OLE2 directory to inspect
     * @param password the password used to verify and decrypt an encrypted package
     * @return the extractor for the contained OOXML package
     * @throws IOException if neither entry exists, the password is invalid, or
     *         reading / decrypting fails
     */
    @Override
    public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
        // First, check for plain OOXML package
        if (poifsDir.hasEntry("Package")) {
            try (InputStream is = poifsDir.createDocumentInputStream("Package")) {
                return create(is, password);
            }
        }

        if (poifsDir.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
            EncryptionInfo ei = new EncryptionInfo(poifsDir);
            Decryptor dec = ei.getDecryptor();
            try {
                if (!dec.verifyPassword(password)) {
                    throw new IOException("Invalid password specified");
                }
                try (InputStream is = dec.getDataStream(poifsDir)) {
                    return create(is, password);
                }
            } catch (IOException e) {
                throw e;
            } catch (Exception e) {
                throw new IOException(e);
            }
        }

        throw new IOException("The OLE2 file neither contained a plain OOXML package node (\"Package\") nor an encrypted one (\"EncryptedPackage\").");
    }
}

View File

@ -36,9 +36,10 @@ import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProper
* content of the OOXML file properties, eg author * content of the OOXML file properties, eg author
* and title. * and title.
*/ */
public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor { public class POIXMLPropertiesTextExtractor implements POIXMLTextExtractor {
private final POIXMLDocument doc;
private final DateFormat dateFormat; private final DateFormat dateFormat;
private boolean doCloseFilesystem = true;
/** /**
* Creates a new POIXMLPropertiesTextExtractor for the given open document. * Creates a new POIXMLPropertiesTextExtractor for the given open document.
@ -46,7 +47,7 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor {
* @param doc the given open document * @param doc the given open document
*/ */
public POIXMLPropertiesTextExtractor(POIXMLDocument doc) { public POIXMLPropertiesTextExtractor(POIXMLDocument doc) {
super(doc); this.doc = doc;
DateFormatSymbols dfs = DateFormatSymbols.getInstance(Locale.ROOT); DateFormatSymbols dfs = DateFormatSymbols.getInstance(Locale.ROOT);
dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", dfs); dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", dfs);
dateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC); dateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC);
@ -281,4 +282,24 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor {
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!"); throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
} }
@Override
public POIXMLDocument getDocument() {
return doc;
}
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public POIXMLDocument getFilesystem() {
return null;
}
} }

View File

@ -27,42 +27,30 @@ import org.apache.poi.ooxml.POIXMLProperties.ExtendedProperties;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.util.ZipSecureFile; import org.apache.poi.openxml4j.util.ZipSecureFile;
public abstract class POIXMLTextExtractor extends POITextExtractor { public interface POIXMLTextExtractor extends POITextExtractor {
/** The POIXMLDocument that's open */
private final POIXMLDocument _document;
/**
* Creates a new text extractor for the given document
*
* @param document the document to extract from
*/
public POIXMLTextExtractor(POIXMLDocument document) {
_document = document;
}
/** /**
* Returns the core document properties * Returns the core document properties
* *
* @return the core document properties * @return the core document properties
*/ */
public CoreProperties getCoreProperties() { default CoreProperties getCoreProperties() {
return _document.getProperties().getCoreProperties(); return getDocument().getProperties().getCoreProperties();
} }
/** /**
* Returns the extended document properties * Returns the extended document properties
* *
* @return the extended document properties * @return the extended document properties
*/ */
public ExtendedProperties getExtendedProperties() { default ExtendedProperties getExtendedProperties() {
return _document.getProperties().getExtendedProperties(); return getDocument().getProperties().getExtendedProperties();
} }
/** /**
* Returns the custom document properties * Returns the custom document properties
* *
* @return the custom document properties * @return the custom document properties
*/ */
public CustomProperties getCustomProperties() { default CustomProperties getCustomProperties() {
return _document.getProperties().getCustomProperties(); return getDocument().getProperties().getCustomProperties();
} }
/** /**
@ -71,17 +59,16 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
* @return the opened document * @return the opened document
*/ */
@Override @Override
public final POIXMLDocument getDocument() { POIXMLDocument getDocument();
return _document;
}
/** /**
* Returns the opened OPCPackage that contains the document * Returns the opened OPCPackage that contains the document
* *
* @return the opened OPCPackage * @return the opened OPCPackage
*/ */
public OPCPackage getPackage() { default OPCPackage getPackage() {
return _document.getPackage(); POIXMLDocument doc = getDocument();
return doc != null ? doc.getPackage() : null;
} }
/** /**
@ -89,25 +76,24 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
* document properties metadata, such as title and author. * document properties metadata, such as title and author.
*/ */
@Override @Override
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() { default POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
return new POIXMLPropertiesTextExtractor(_document); return new POIXMLPropertiesTextExtractor(getDocument());
} }
@Override @Override
public void close() throws IOException { default void close() throws IOException {
// e.g. XSSFEventBaseExcelExtractor passes a null-document // e.g. XSSFEventBaseExcelExtractor passes a null-document
if(_document != null) { if (isCloseFilesystem()) {
@SuppressWarnings("resource") @SuppressWarnings("resource")
OPCPackage pkg = _document.getPackage(); OPCPackage pkg = getPackage();
if(pkg != null) { if (pkg != null) {
// revert the package to not re-write the file, which is very likely not wanted for a TextExtractor! // revert the package to not re-write the file, which is very likely not wanted for a TextExtractor!
pkg.revert(); pkg.revert();
} }
} }
super.close();
} }
protected void checkMaxTextSize(CharSequence text, String string) { default void checkMaxTextSize(CharSequence text, String string) {
if(string == null) { if(string == null) {
return; return;
} }

View File

@ -18,7 +18,6 @@ package org.apache.poi.xdgf.extractor;
import java.io.IOException; import java.io.IOException;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xdgf.usermodel.XDGFPage; import org.apache.poi.xdgf.usermodel.XDGFPage;
@ -28,12 +27,12 @@ import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor;
/** /**
* Helper class to extract text from an OOXML Visio File * Helper class to extract text from an OOXML Visio File
*/ */
public class XDGFVisioExtractor extends POIXMLTextExtractor { public class XDGFVisioExtractor implements POIXMLTextExtractor {
protected final XmlVisioDocument document; protected final XmlVisioDocument document;
private boolean doCloseFilesystem = true;
public XDGFVisioExtractor(XmlVisioDocument document) { public XDGFVisioExtractor(XmlVisioDocument document) {
super(document);
this.document = document; this.document = document;
} }
@ -51,17 +50,23 @@ public class XDGFVisioExtractor extends POIXMLTextExtractor {
return visitor.getText(); return visitor.getText();
} }
public static void main(String [] args) throws IOException { @Override
if (args.length < 1) { public XmlVisioDocument getDocument() {
System.err.println("Use:"); return document;
System.err.println(" XDGFVisioExtractor <filename.vsdx>"); }
System.exit(1);
} @Override
POIXMLTextExtractor extractor = public void setCloseFilesystem(boolean doCloseFilesystem) {
new XDGFVisioExtractor(POIXMLDocument.openPackage( this.doCloseFilesystem = doCloseFilesystem;
args[0] }
));
System.out.println(extractor.getText()); @Override
extractor.close(); public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public XmlVisioDocument getFilesystem() {
return document;
} }
} }

View File

@ -0,0 +1,45 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.xslf.extractor;
import org.apache.poi.ooxml.extractor.POIXMLPropertiesTextExtractor;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
/**
 * Text extractor for OOXML Powerpoint files: a {@link SlideShowExtractor}
 * for {@link XMLSlideShow}s which also exposes the {@link POIXMLTextExtractor}
 * contract.
 */
public class XSLFExtractor extends SlideShowExtractor<XSLFShape, XSLFTextParagraph> implements POIXMLTextExtractor {

    /**
     * Creates an extractor for the given slideshow.
     *
     * @param slideshow the slideshow to extract text from
     */
    public XSLFExtractor(XMLSlideShow slideshow) {
        super(slideshow);
    }

    /**
     * @return the slideshow held by the superclass, cast to {@link XMLSlideShow}
     */
    @Override
    public XMLSlideShow getDocument() {
        final XMLSlideShow xmlSlideShow = (XMLSlideShow) slideshow;
        return xmlSlideShow;
    }

    /**
     * Explicitly selects the default implementation declared by
     * {@link POIXMLTextExtractor}.
     *
     * @return the metadata extractor provided by the {@link POIXMLTextExtractor} default method
     */
    @Override
    public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
        final POIXMLPropertiesTextExtractor metadataExtractor = POIXMLTextExtractor.super.getMetadataTextExtractor();
        return metadataExtractor;
    }
}

View File

@ -19,7 +19,6 @@ package org.apache.poi.xssf.extractor;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.DataFormatter;
@ -43,8 +42,7 @@ import org.xml.sax.SAXException;
* *
* @since 3.16-beta3 * @since 3.16-beta3
*/ */
public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor {
implements org.apache.poi.ss.extractor.ExcelExtractor {
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFBEventBasedExcelExtractor.class); private static final POILogger LOGGER = POILogFactory.getLogger(XSSFBEventBasedExcelExtractor.class);
@ -62,18 +60,6 @@ public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor
super(container); super(container);
} }
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Use:");
System.err.println(" XSSFBEventBasedExcelExtractor <filename.xlsb>");
System.exit(1);
}
POIXMLTextExtractor extractor =
new XSSFBEventBasedExcelExtractor(args[0]);
System.out.println(extractor.getText());
extractor.close();
}
public void setHandleHyperlinksInCells(boolean handleHyperlinksInCells) { public void setHandleHyperlinksInCells(boolean handleHyperlinksInCells) {
this.handleHyperlinksInCells = handleHyperlinksInCells; this.handleHyperlinksInCells = handleHyperlinksInCells;
} }

View File

@ -25,6 +25,7 @@ import java.util.Map;
import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.ParserConfigurationException;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.POIXMLProperties; import org.apache.poi.ooxml.POIXMLProperties;
import org.apache.poi.ooxml.POIXMLProperties.CoreProperties; import org.apache.poi.ooxml.POIXMLProperties.CoreProperties;
import org.apache.poi.ooxml.POIXMLProperties.CustomProperties; import org.apache.poi.ooxml.POIXMLProperties.CustomProperties;
@ -57,13 +58,13 @@ import org.xml.sax.XMLReader;
* Implementation of a text extractor from OOXML Excel * Implementation of a text extractor from OOXML Excel
* files that uses SAX event based parsing. * files that uses SAX event based parsing.
*/ */
public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor public class XSSFEventBasedExcelExtractor
implements org.apache.poi.ss.extractor.ExcelExtractor { implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFEventBasedExcelExtractor.class); private static final POILogger LOGGER = POILogFactory.getLogger(XSSFEventBasedExcelExtractor.class);
protected OPCPackage container; protected final OPCPackage container;
protected POIXMLProperties properties; protected final POIXMLProperties properties;
protected Locale locale; protected Locale locale;
protected boolean includeTextBoxes = true; protected boolean includeTextBoxes = true;
@ -73,29 +74,17 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
protected boolean formulasNotResults; protected boolean formulasNotResults;
protected boolean concatenatePhoneticRuns = true; protected boolean concatenatePhoneticRuns = true;
private boolean doCloseFilesystem = true;
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(OPCPackage.open(path)); this(OPCPackage.open(path));
} }
public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
super(null);
this.container = container; this.container = container;
properties = new POIXMLProperties(container); properties = new POIXMLProperties(container);
} }
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Use:");
System.err.println(" XSSFEventBasedExcelExtractor <filename.xlsx>");
System.exit(1);
}
POIXMLTextExtractor extractor =
new XSSFEventBasedExcelExtractor(args[0]);
System.out.println(extractor.getText());
extractor.close();
}
/** /**
* Should sheet names be included? Default is true * Should sheet names be included? Default is true
*/ */
@ -319,12 +308,23 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
} }
@Override @Override
public void close() throws IOException { public POIXMLDocument getDocument() {
if (container != null) { return null;
container.close(); }
container = null;
} @Override
super.close(); public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public OPCPackage getFilesystem() {
return container;
} }
protected class SheetTextExtractor implements SheetContentsHandler { protected class SheetTextExtractor implements SheetContentsHandler {

View File

@ -20,8 +20,8 @@ import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import java.util.Locale; import java.util.Locale;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Cell;
@ -44,8 +44,8 @@ import org.apache.xmlbeans.XmlException;
/** /**
* Helper class to extract text from an OOXML Excel file * Helper class to extract text from an OOXML Excel file
*/ */
public class XSSFExcelExtractor extends POIXMLTextExtractor public class XSSFExcelExtractor
implements org.apache.poi.ss.extractor.ExcelExtractor { implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
public static final XSSFRelation[] SUPPORTED_TYPES = new XSSFRelation[] { public static final XSSFRelation[] SUPPORTED_TYPES = new XSSFRelation[] {
XSSFRelation.WORKBOOK, XSSFRelation.MACRO_TEMPLATE_WORKBOOK, XSSFRelation.WORKBOOK, XSSFRelation.MACRO_TEMPLATE_WORKBOOK,
XSSFRelation.MACRO_ADDIN_WORKBOOK, XSSFRelation.TEMPLATE_WORKBOOK, XSSFRelation.MACRO_ADDIN_WORKBOOK, XSSFRelation.TEMPLATE_WORKBOOK,
@ -53,34 +53,21 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
}; };
private Locale locale; private Locale locale;
private XSSFWorkbook workbook; private final XSSFWorkbook workbook;
private boolean includeSheetNames = true; private boolean includeSheetNames = true;
private boolean formulasNotResults; private boolean formulasNotResults;
private boolean includeCellComments; private boolean includeCellComments;
private boolean includeHeadersFooters = true; private boolean includeHeadersFooters = true;
private boolean includeTextBoxes = true; private boolean includeTextBoxes = true;
private boolean doCloseFilesystem = true;
public XSSFExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { public XSSFExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
this(new XSSFWorkbook(container)); this(new XSSFWorkbook(container));
} }
public XSSFExcelExtractor(XSSFWorkbook workbook) { public XSSFExcelExtractor(XSSFWorkbook workbook) {
super(workbook);
this.workbook = workbook; this.workbook = workbook;
} }
public static void main(String[] args) throws Exception {
if(args.length < 1) {
System.err.println("Use:");
System.err.println(" XSSFExcelExtractor <filename.xlsx>");
System.exit(1);
}
try (OPCPackage pkg = OPCPackage.create(args[0]);
POIXMLTextExtractor extractor = new XSSFExcelExtractor(pkg)) {
System.out.println(extractor.getText());
}
}
/** /**
* Should sheet names be included? Default is true * Should sheet names be included? Default is true
*/ */
@ -262,4 +249,24 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
private String extractHeaderFooter(HeaderFooter hf) { private String extractHeaderFooter(HeaderFooter hf) {
return ExcelExtractor._extractHeaderFooter(hf); return ExcelExtractor._extractHeaderFooter(hf);
} }
@Override
public XSSFWorkbook getDocument() {
return workbook;
}
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public XSSFWorkbook getFilesystem() {
return workbook;
}
} }

View File

@ -19,9 +19,7 @@ package org.apache.poi.xwpf.extractor;
import java.io.IOException; import java.io.IOException;
import java.util.List; import java.util.List;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor; import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator; import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
@ -39,46 +37,31 @@ import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
/** /**
* Helper class to extract text from an OOXML Word file * Helper class to extract text from an OOXML Word file
*/ */
public class XWPFWordExtractor extends POIXMLTextExtractor { public class XWPFWordExtractor implements POIXMLTextExtractor {
public static final XWPFRelation[] SUPPORTED_TYPES = { public static final XWPFRelation[] SUPPORTED_TYPES = {
XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE, XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE,
XWPFRelation.MACRO_DOCUMENT, XWPFRelation.MACRO_DOCUMENT,
XWPFRelation.MACRO_TEMPLATE_DOCUMENT XWPFRelation.MACRO_TEMPLATE_DOCUMENT
}; };
private XWPFDocument document; private final XWPFDocument document;
private boolean fetchHyperlinks; private boolean fetchHyperlinks;
private boolean concatenatePhoneticRuns = true; private boolean concatenatePhoneticRuns = true;
private boolean doCloseFilesystem = true;
public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException { public XWPFWordExtractor(OPCPackage container) throws IOException {
this(new XWPFDocument(container)); this(new XWPFDocument(container));
} }
public XWPFWordExtractor(XWPFDocument document) { public XWPFWordExtractor(XWPFDocument document) {
super(document);
this.document = document; this.document = document;
} }
public static void main(String[] args) throws Exception {
if (args.length < 1) {
System.err.println("Use:");
System.err.println(" XWPFWordExtractor <filename.docx>");
System.exit(1);
}
POIXMLTextExtractor extractor =
new XWPFWordExtractor(POIXMLDocument.openPackage(
args[0]
));
System.out.println(extractor.getText());
extractor.close();
}
/** /**
* Should we also fetch the hyperlinks, when fetching * Should we also fetch the hyperlinks, when fetching
* the text content? Default is to only output the * the text content? Default is to only output the
@ -217,4 +200,24 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
text.append(hfPolicy.getDefaultHeader().getText()); text.append(hfPolicy.getDefaultHeader().getText());
} }
} }
@Override
public XWPFDocument getDocument() {
return document;
}
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
@Override
public XWPFDocument getFilesystem() {
return document;
}
} }

View File

@ -31,23 +31,25 @@ import java.util.Locale;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.UnsupportedFileFormatException; import org.apache.poi.UnsupportedFileFormatException;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.OldExcelFormatException;
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor; import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.ooxml.extractor.ExtractorFactory; import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess; import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.poifs.filesystem.FileMagic; import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.NotOLE2FileException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.ExpectedException;
/** /**
* Test that the extractor factory plays nicely * Test that the extractor factory plays nicely
@ -89,6 +91,8 @@ public class TestExtractorFactory {
private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance(); private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
private static File pub = getFileAndCheck(pubTests, "Simple.pub"); private static File pub = getFileAndCheck(pubTests, "Simple.pub");
private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory();
private static File getFileAndCheck(POIDataSamples samples, String name) { private static File getFileAndCheck(POIDataSamples samples, String name) {
File file = samples.getFile(name); File file = samples.getFile(name);
@ -110,7 +114,7 @@ public class TestExtractorFactory {
"Word 6", doc6, "Word6Extractor", 20, "Word 6", doc6, "Word6Extractor", 20,
"Word 95", doc95, "Word6Extractor", 120, "Word 95", doc95, "Word6Extractor", 120,
"PowerPoint", ppt, "SlideShowExtractor", 120, "PowerPoint", ppt, "SlideShowExtractor", 120,
"PowerPoint - pptx", pptx, "SlideShowExtractor", 120, "PowerPoint - pptx", pptx, "XSLFExtractor", 120,
"Visio", vsd, "VisioTextExtractor", 50, "Visio", vsd, "VisioTextExtractor", 50,
"Visio - vsdx", vsdx, "XDGFVisioExtractor", 20, "Visio - vsdx", vsdx, "XDGFVisioExtractor", 20,
"Publisher", pub, "PublisherTextExtractor", 50, "Publisher", pub, "PublisherTextExtractor", 50,
@ -125,6 +129,8 @@ public class TestExtractorFactory {
R apply(T t) throws IOException, OpenXML4JException, XmlException; R apply(T t) throws IOException, OpenXML4JException, XmlException;
} }
@Rule
public ExpectedException thrown = ExpectedException.none();
@Test @Test
public void testFile() throws Exception { public void testFile() throws Exception {
@ -135,12 +141,12 @@ public class TestExtractorFactory {
} }
} }
@Test(expected = IllegalArgumentException.class) @Test
public void testFileInvalid() throws Exception { public void testFileInvalid() throws Exception {
thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN");
thrown.expect(IOException.class);
// Text // Text
try (POITextExtractor ignored = ExtractorFactory.createExtractor(txt)) { ExtractorFactory.createExtractor(txt);
fail("extracting from invalid package");
}
} }
@Test @Test
@ -148,8 +154,10 @@ public class TestExtractorFactory {
testStream(ExtractorFactory::createExtractor, true); testStream(ExtractorFactory::createExtractor, true);
} }
@Test(expected = IllegalArgumentException.class) @Test
public void testInputStreamInvalid() throws Exception { public void testInputStreamInvalid() throws Exception {
thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN");
thrown.expect(IOException.class);
testInvalid(ExtractorFactory::createExtractor); testInvalid(ExtractorFactory::createExtractor);
} }
@ -158,8 +166,10 @@ public class TestExtractorFactory {
testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false); testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
} }
@Test(expected = IOException.class) @Test
public void testPOIFSInvalid() throws Exception { public void testPOIFSInvalid() throws Exception {
thrown.expectMessage("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0");
thrown.expect(NotOLE2FileException.class);
testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f))); testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
} }
@ -195,9 +205,7 @@ public class TestExtractorFactory {
POITextExtractor ignored = poifs.apply(fis)) { POITextExtractor ignored = poifs.apply(fis)) {
fail("extracting from invalid package"); fail("extracting from invalid package");
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
assertTrue("Had: " + e, assertTrue("Had: " + e, e.getMessage().contains(FileMagic.UNKNOWN.name()));
e.getMessage().contains(FileMagic.UNKNOWN.name()));
throw e; throw e;
} }
} }
@ -211,7 +219,7 @@ public class TestExtractorFactory {
} }
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ); try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) { final POITextExtractor ext = xmlFactory.create(pkg)) {
testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]); testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
pkg.revert(); pkg.revert();
} }
@ -222,7 +230,7 @@ public class TestExtractorFactory {
public void testPackageInvalid() throws Exception { public void testPackageInvalid() throws Exception {
// Text // Text
try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ); try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
final POITextExtractor ignored = ExtractorFactory.createExtractor(pkg)) { final POITextExtractor ignored = xmlFactory.create(pkg)) {
fail("extracting from invalid package"); fail("extracting from invalid package");
} }
} }
@ -251,61 +259,45 @@ public class TestExtractorFactory {
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors()); assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
try {
// Check we get the right extractors now
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
assertTrue(extractor instanceof EventBasedExcelExtractor);
}
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
assertTrue(extractor.getText().length() > 200);
}
// Check we get the right extractors now try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
assertTrue( }
extractor
instanceof EventBasedExcelExtractor
);
extractor.close();
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor); assertTrue(extractor.getText().length() > 200);
extractor.close(); }
} finally {
// Put back to normal
ExtractorFactory.setThreadPrefersEventExtractors(false);
}
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
// Put back to normal
ExtractorFactory.setThreadPrefersEventExtractors(false);
assertFalse(ExtractorFactory.getPreferEventExtractor()); assertFalse(ExtractorFactory.getPreferEventExtractor());
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors()); assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors()); assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
// And back // And back
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls))); try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
assertTrue( assertTrue(extractor instanceof ExcelExtractor);
extractor }
instanceof ExcelExtractor try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
); assertTrue(extractor.getText().length() > 200);
extractor.close(); }
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ)); try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
assertTrue( assertTrue(extractor instanceof XSSFExcelExtractor);
extractor }
instanceof XSSFExcelExtractor try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) {
); assertTrue(extractor.getText().length() > 200);
extractor.close(); }
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
assertTrue(
extractor.getText().length() > 200
);
extractor.close();
} }
/** /**
@ -325,7 +317,7 @@ public class TestExtractorFactory {
}; };
for (int i=0; i<testObj.length; i+=3) { for (int i=0; i<testObj.length; i+=3) {
try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) { try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor)ExtractorFactory.createExtractor((File)testObj[i+1])) {
final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext); final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0; int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
@ -463,16 +455,17 @@ public class TestExtractorFactory {
* #59074 - Excel 95 files should give a helpful message, not just * #59074 - Excel 95 files should give a helpful message, not just
* "No supported documents found in the OLE2 stream" * "No supported documents found in the OLE2 stream"
*/ */
@Test(expected = OldExcelFormatException.class)
public void bug59074() throws Exception { public void bug59074() throws Exception {
ExtractorFactory.createExtractor( try (POITextExtractor extractor = ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"))) {
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls")); String text = extractor.getText();
assertContains(text, "testdoc");
}
} }
@Test(expected = IllegalStateException.class) @Test(expected = IllegalStateException.class)
public void testGetEmbeddedFromXMLExtractor() { public void testGetEmbeddedFromXMLExtractor() throws IOException {
// currently not implemented // currently not implemented
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null); ExtractorFactory.getEmbeddedDocsTextExtractors(null);
} }
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed. // This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.

View File

@ -60,9 +60,9 @@ import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.POITestCase; import org.apache.poi.POITestCase;
import org.apache.poi.UnsupportedFileFormatException; import org.apache.poi.UnsupportedFileFormatException;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.ooxml.POIXMLException; import org.apache.poi.ooxml.POIXMLException;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.ooxml.util.DocumentHelper; import org.apache.poi.ooxml.util.DocumentHelper;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.InvalidOperationException; import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@ -836,7 +836,7 @@ public final class TestPackage {
@Test @Test
public void testZipEntityExpansionExceedsMemory() throws IOException, OpenXML4JException, XmlException { public void testZipEntityExpansionExceedsMemory() throws IOException, OpenXML4JException, XmlException {
expectedEx.expect(POIXMLException.class); expectedEx.expect(IOException.class);
expectedEx.expectMessage("unable to parse shared strings table"); expectedEx.expectMessage("unable to parse shared strings table");
expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than")); expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than"));
openXmlBombFile("poc-xmlbomb.xlsx"); openXmlBombFile("poc-xmlbomb.xlsx");
@ -844,7 +844,7 @@ public final class TestPackage {
@Test @Test
public void testZipEntityExpansionExceedsMemory2() throws IOException, OpenXML4JException, XmlException { public void testZipEntityExpansionExceedsMemory2() throws IOException, OpenXML4JException, XmlException {
expectedEx.expect(POIXMLException.class); expectedEx.expect(IOException.class);
expectedEx.expectMessage("unable to parse shared strings table"); expectedEx.expectMessage("unable to parse shared strings table");
expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than")); expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than"));
openXmlBombFile("poc-xmlbomb-empty.xlsx"); openXmlBombFile("poc-xmlbomb-empty.xlsx");

View File

@ -35,14 +35,12 @@ import java.util.Collection;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.POIDocument; import org.apache.poi.POIDocument;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.poifs.crypt.EncryptionInfo; import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.crypt.cryptoapi.CryptoAPIEncryptionHeader; import org.apache.poi.poifs.crypt.cryptoapi.CryptoAPIEncryptionHeader;
import org.apache.poi.poifs.storage.RawDataUtil; import org.apache.poi.poifs.storage.RawDataUtil;
import org.apache.xmlbeans.XmlException;
import org.junit.Test; import org.junit.Test;
import org.junit.runner.RunWith; import org.junit.runner.RunWith;
import org.junit.runners.Parameterized; import org.junit.runners.Parameterized;
@ -91,7 +89,7 @@ public class TestHxxFEncryption {
} }
@Test @Test
public void extract() throws IOException, OpenXML4JException, XmlException { public void extract() throws IOException {
File f = sampleDir.getFile(file); File f = sampleDir.getFile(file);
Biff8EncryptionKey.setCurrentUserPassword(password); Biff8EncryptionKey.setCurrentUserPassword(password);
try (POITextExtractor te = ExtractorFactory.createExtractor(f)) { try (POITextExtractor te = ExtractorFactory.createExtractor(f)) {
@ -103,16 +101,16 @@ public class TestHxxFEncryption {
} }
@Test @Test
public void changePassword() throws IOException, OpenXML4JException, XmlException { public void changePassword() throws IOException {
newPassword("test"); newPassword("test");
} }
@Test @Test
public void removePassword() throws IOException, OpenXML4JException, XmlException { public void removePassword() throws IOException {
newPassword(null); newPassword(null);
} }
private void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException { private void newPassword(String newPass) throws IOException {
File f = sampleDir.getFile(file); File f = sampleDir.getFile(file);
Biff8EncryptionKey.setCurrentUserPassword(password); Biff8EncryptionKey.setCurrentUserPassword(password);
try (POITextExtractor te1 = ExtractorFactory.createExtractor(f)) { try (POITextExtractor te1 = ExtractorFactory.createExtractor(f)) {
@ -133,7 +131,7 @@ public class TestHxxFEncryption {
/** changing the encryption mode and key size in poor mans style - see comments below */ /** changing the encryption mode and key size in poor mans style - see comments below */
@Test @Test
public void changeEncryption() throws IOException, OpenXML4JException, XmlException { public void changeEncryption() throws IOException {
File f = sampleDir.getFile(file); File f = sampleDir.getFile(file);
ByteArrayOutputStream bos = new ByteArrayOutputStream(); ByteArrayOutputStream bos = new ByteArrayOutputStream();
Biff8EncryptionKey.setCurrentUserPassword(password); Biff8EncryptionKey.setCurrentUserPassword(password);
@ -157,7 +155,7 @@ public class TestHxxFEncryption {
POIDocument doc = (POIDocument) te3.getDocument()) { POIDocument doc = (POIDocument) te3.getDocument()) {
// need to cache data (i.e. read all data) before changing the key size // need to cache data (i.e. read all data) before changing the key size
Class<?> clazz = doc.getClass(); Class<?> clazz = doc.getClass();
if ("HSLFSlideShowImpl".equals(clazz.getSimpleName())) { if ("HSLFSlideShow".equals(clazz.getSimpleName())) {
try { try {
clazz.getDeclaredMethod("getPictureData").invoke(doc); clazz.getDeclaredMethod("getPictureData").invoke(doc);
} catch (ReflectiveOperationException e) { } catch (ReflectiveOperationException e) {

View File

@ -522,7 +522,7 @@ public class TestXSLFBugs {
private String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException { private String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
try (SlideShowExtractor<XSLFShape,XSLFTextParagraph> extr = new SlideShowExtractor<>(ppt)) { try (SlideShowExtractor<XSLFShape,XSLFTextParagraph> extr = new SlideShowExtractor<>(ppt)) {
// do not auto-close the slideshow // do not auto-close the slideshow
extr.setFilesystem(null); extr.setCloseFilesystem(false);
extr.setSlidesByDefault(true); extr.setSlidesByDefault(true);
extr.setNotesByDefault(false); extr.setNotesByDefault(false);
extr.setMasterByDefault(false); extr.setMasterByDefault(false);

View File

@ -29,20 +29,18 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.ooxml.extractor.ExtractorFactory; import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow; import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape; import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFTextParagraph; import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
import org.apache.xmlbeans.XmlException;
import org.junit.Test; import org.junit.Test;
/** /**
* Tests for XSLFPowerPointExtractor * Tests for XSLFPowerPointExtractor
*/ */
public class TestXSLFPowerPointExtractor { public class TestXSLFPowerPointExtractor {
private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance(); private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
/** /**
* Get text out of the simple file * Get text out of the simple file
@ -262,10 +260,11 @@ public class TestXSLFPowerPointExtractor {
} }
@Test @Test
public void test45541() throws IOException, OpenXML4JException, XmlException { public void test45541() throws IOException {
// extract text from a powerpoint that has a header in the notes-element // extract text from a powerpoint that has a header in the notes-element
final File headerFile = slTests.getFile("45541_Header.pptx"); final File headerFile = slTests.getFile("45541_Header.pptx");
try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) { //noinspection rawtypes
try (final SlideShowExtractor extr = (SlideShowExtractor) ExtractorFactory.createExtractor(headerFile)) {
String text = extr.getText(); String text = extr.getText();
assertNotNull(text); assertNotNull(text);
assertFalse("Had: " + text, text.contains("testdoc")); assertFalse("Had: " + text, text.contains("testdoc"));
@ -280,7 +279,8 @@ public class TestXSLFPowerPointExtractor {
// extract text from a powerpoint that has a footer in the master-slide // extract text from a powerpoint that has a footer in the master-slide
final File footerFile = slTests.getFile("45541_Footer.pptx"); final File footerFile = slTests.getFile("45541_Footer.pptx");
try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) { //noinspection rawtypes
try (SlideShowExtractor extr = (SlideShowExtractor)ExtractorFactory.createExtractor(footerFile)) {
String text = extr.getText(); String text = extr.getText();
assertNotContained(text, "testdoc"); assertNotContained(text, "testdoc");

View File

@ -16,7 +16,7 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.xssf.extractor; package org.apache.poi.xssf.extractor;
import org.apache.poi.ooxml.extractor.ExtractorFactory; import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.HSSFTestDataSamples;
import org.junit.After; import org.junit.After;

View File

@ -17,8 +17,8 @@
package org.apache.poi.xssf.extractor; package org.apache.poi.xssf.extractor;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.junit.After; import org.junit.After;
/** /**

View File

@ -0,0 +1,18 @@
# ====================================================================
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
org.apache.poi.extractor.MainExtractorFactory

View File

@ -0,0 +1,18 @@
# ====================================================================
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
org.apache.poi.ooxml.extractor.POIXMLExtractorFactory

View File

@ -0,0 +1,18 @@
# ====================================================================
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory

View File

@ -17,44 +17,66 @@
package org.apache.poi.extractor.ole2; package org.apache.poi.extractor.ole2;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.stream.StreamSupport;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.ExtractorProvider;
import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.extractor.POITextExtractor; import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.extractor.OLE2ExtractorFactory;
import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlideShow; import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
import org.apache.poi.hsmf.MAPIMessage; import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks; import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.extractor.OutlookTextExtractor; import org.apache.poi.hsmf.extractor.OutlookTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor; import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor; import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShowFactory; import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger; import org.apache.poi.util.POILogger;
/** /**
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and * Scratchpad-specific logic for {@link ExtractorFactory} and
* {@link org.apache.poi.extractor.ExtractorFactory}, which permit the other two to run with * {@link org.apache.poi.extractor.ExtractorFactory}, which permit the other two to run with
* no Scratchpad jar (though without functionality!) * no Scratchpad jar (though without functionality!)
* <p>Note - should not be used standalone, always use via the other * <p>Note - should not be used standalone, always use via the other
* two classes</p> * two classes</p>
*/ */
@SuppressWarnings("WeakerAccess") @SuppressWarnings("WeakerAccess")
public class OLE2ScratchpadExtractorFactory { public class OLE2ScratchpadExtractorFactory implements ExtractorProvider {
private static final POILogger logger = POILogFactory.getLogger(OLE2ScratchpadExtractorFactory.class); private static final POILogger logger = POILogFactory.getLogger(OLE2ScratchpadExtractorFactory.class);
@Override
public boolean accepts(FileMagic fm) {
return FileMagic.OLE2 == fm;
}
@Override
public POITextExtractor create(File file, String password) throws IOException {
return create(new POIFSFileSystem(file, true).getRoot(), password);
}
@Override
public POITextExtractor create(InputStream inputStream, String password) throws IOException {
return create(new POIFSFileSystem(inputStream).getRoot(), password);
}
/** /**
* Look for certain entries in the stream, to figure * Look for certain entries in the stream, to figure
* out what format is desired * out what format is desired
@ -66,48 +88,54 @@ public class OLE2ScratchpadExtractorFactory {
* *
* @throws IOException when the format specific extraction fails because of invalid entries * @throws IOException when the format specific extraction fails because of invalid entries
*/ */
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException { public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
if (poifsDir.hasEntry("WordDocument")) { final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
// Old or new style word document? try {
try { Biff8EncryptionKey.setCurrentUserPassword(password);
return new WordExtractor(poifsDir); if (poifsDir.hasEntry("WordDocument")) {
} catch (OldWordFileFormatException e) { // Old or new style word document?
return new Word6Extractor(poifsDir); try {
return new WordExtractor(poifsDir);
} catch (OldWordFileFormatException e) {
return new Word6Extractor(poifsDir);
}
} }
}
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) { if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
return new SlideShowExtractor(SlideShowFactory.create(poifsDir)); return new SlideShowExtractor<HSLFShape, HSLFTextParagraph>(SlideShowFactory.create(poifsDir));
}
if (poifsDir.hasEntry("VisioDocument")) {
return new VisioTextExtractor(poifsDir);
}
if (poifsDir.hasEntry("Quill")) {
return new PublisherTextExtractor(poifsDir);
}
final String[] outlookEntryNames = new String[] {
// message bodies, saved as plain text (PtypString)
// The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
// the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
// https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx
// @see org.apache.poi.hsmf.Types.MAPIType
"__substg1.0_1000001E", //PidTagBody ASCII
"__substg1.0_1000001F", //PidTagBody Unicode
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
"__substg1.0_0037001E", //PidTagSubject ASCII
"__substg1.0_0037001F", //PidTagSubject Unicode
};
for (String entryName : outlookEntryNames) {
if (poifsDir.hasEntry(entryName)) {
return new OutlookTextExtractor(poifsDir);
} }
if (poifsDir.hasEntry("VisioDocument")) {
return new VisioTextExtractor(poifsDir);
}
if (poifsDir.hasEntry("Quill")) {
return new PublisherTextExtractor(poifsDir);
}
final String[] outlookEntryNames = new String[]{
// message bodies, saved as plain text (PtypString)
// The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
// the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
// https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx
// @see org.apache.poi.hsmf.Types.MAPIType
"__substg1.0_1000001E", //PidTagBody ASCII
"__substg1.0_1000001F", //PidTagBody Unicode
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
"__substg1.0_0037001E", //PidTagSubject ASCII
"__substg1.0_0037001F", //PidTagSubject Unicode
};
for (String entryName : outlookEntryNames) {
if (poifsDir.hasEntry(entryName)) {
return new OutlookTextExtractor(poifsDir);
}
}
} finally {
Biff8EncryptionKey.setCurrentUserPassword(oldPW);
} }
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); return null;
} }
/** /**
@ -120,10 +148,9 @@ public class OLE2ScratchpadExtractorFactory {
* @param ext the extractor holding the directory to start parsing * @param ext the extractor holding the directory to start parsing
* @param dirs a list to be filled with directory references holding embedded * @param dirs a list to be filled with directory references holding embedded
* @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries * @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
*
* @throws IOException when the format specific extraction fails because of invalid entries
*/ */
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException { @Override
public void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) {
// Find all the embedded directories // Find all the embedded directories
DirectoryEntry root = ext.getRoot(); DirectoryEntry root = ext.getRoot();
if (root == null) { if (root == null) {
@ -132,25 +159,16 @@ public class OLE2ScratchpadExtractorFactory {
if (ext instanceof ExcelExtractor) { if (ext instanceof ExcelExtractor) {
// These are in MBD... under the root // These are in MBD... under the root
Iterator<Entry> it = root.getEntries(); StreamSupport.stream(root.spliterator(), false)
while (it.hasNext()) { .filter(entry -> entry.getName().startsWith("MBD"))
Entry entry = it.next(); .forEach(dirs::add);
if (entry.getName().startsWith("MBD")) {
dirs.add(entry);
}
}
} else if (ext instanceof WordExtractor) { } else if (ext instanceof WordExtractor) {
// These are in ObjectPool -> _... under the root // These are in ObjectPool -> _... under the root
try { try {
DirectoryEntry op = (DirectoryEntry) DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
root.getEntry("ObjectPool"); StreamSupport.stream(op.spliterator(), false)
Iterator<Entry> it = op.getEntries(); .filter(entry -> entry.getName().startsWith("_"))
while(it.hasNext()) { .forEach(dirs::add);
Entry entry = it.next();
if(entry.getName().startsWith("_")) {
dirs.add(entry);
}
}
} catch(FileNotFoundException e) { } catch(FileNotFoundException e) {
logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage()); logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage());
// ignored here // ignored here

View File

@ -17,7 +17,6 @@
package org.apache.poi.hdgf.extractor; package org.apache.poi.hdgf.extractor;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
@ -38,11 +37,11 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* Can operate on the command line (outputs to stdout), or * Can operate on the command line (outputs to stdout), or
* can return the text for you (example: for use with Lucene). * can return the text for you (example: for use with Lucene).
*/ */
public final class VisioTextExtractor extends POIOLE2TextExtractor { public final class VisioTextExtractor implements POIOLE2TextExtractor {
private HDGFDiagram hdgf; private HDGFDiagram hdgf;
private boolean doCloseFilesystem = true;
public VisioTextExtractor(HDGFDiagram hdgf) { public VisioTextExtractor(HDGFDiagram hdgf) {
super(hdgf);
this.hdgf = hdgf; this.hdgf = hdgf;
} }
public VisioTextExtractor(POIFSFileSystem fs) throws IOException { public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
@ -91,9 +90,7 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor {
// Capture the text, as long as it isn't // Capture the text, as long as it isn't
// simply an empty string // simply an empty string
String str = cmd.getValue().toString(); String str = cmd.getValue().toString();
if(str.isEmpty() || "\n".equals(str)) { if (!(str.isEmpty() || "\n".equals(str))) {
// Ignore empty strings
} else {
text.add( str ); text.add( str );
} }
} }
@ -121,21 +118,23 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor {
return text.toString(); return text.toString();
} }
public static void main(String[] args) throws Exception { @Override
if(args.length == 0) { public HDGFDiagram getDocument() {
System.err.println("Use:"); return hdgf;
System.err.println(" VisioTextExtractor <file.vsd>"); }
System.exit(1);
}
try (FileInputStream fis = new FileInputStream(args[0])) { @Override
VisioTextExtractor extractor = public void setCloseFilesystem(boolean doCloseFilesystem) {
new VisioTextExtractor(fis); this.doCloseFilesystem = doCloseFilesystem;
}
// Print not PrintLn as already has \n added to it @Override
System.out.print(extractor.getText()); public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
extractor.close(); @Override
} public HDGFDiagram getFilesystem() {
return hdgf;
} }
} }

View File

@ -17,35 +17,37 @@
package org.apache.poi.hpbf.extractor; package org.apache.poi.hpbf.extractor;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.extractor.POIOLE2TextExtractor; import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.hpbf.HPBFDocument; import org.apache.poi.hpbf.HPBFDocument;
import org.apache.poi.hpbf.model.qcbits.QCBit; import org.apache.poi.hpbf.model.qcbits.QCBit;
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12; import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/** /**
* Extract text from HPBF Publisher files * Extract text from HPBF Publisher files
*/ */
public final class PublisherTextExtractor extends POIOLE2TextExtractor { public final class PublisherTextExtractor implements POIOLE2TextExtractor {
private HPBFDocument doc; private final HPBFDocument doc;
private boolean hyperlinksByDefault; private boolean hyperlinksByDefault;
private boolean doCloseFilesystem = true;
public PublisherTextExtractor(HPBFDocument doc) { public PublisherTextExtractor(HPBFDocument doc) {
super(doc);
this.doc = doc; this.doc = doc;
} }
public PublisherTextExtractor(DirectoryNode dir) throws IOException { public PublisherTextExtractor(DirectoryNode dir) throws IOException {
this(new HPBFDocument(dir)); this(new HPBFDocument(dir));
} }
public PublisherTextExtractor(POIFSFileSystem fs) throws IOException { public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
this(new HPBFDocument(fs)); this(new HPBFDocument(fs));
} }
public PublisherTextExtractor(InputStream is) throws IOException { public PublisherTextExtractor(InputStream is) throws IOException {
this(new POIFSFileSystem(is)); this(new POIFSFileSystem(is));
} }
@ -66,7 +68,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
// Get the text from the Quill Contents // Get the text from the Quill Contents
QCBit[] bits = doc.getQuillContents().getBits(); QCBit[] bits = doc.getQuillContents().getBits();
for (QCBit bit1 : bits) { for (QCBit bit1 : bits) {
if (bit1 != null && bit1 instanceof QCTextBit) { if (bit1 instanceof QCTextBit) {
QCTextBit t = (QCTextBit) bit1; QCTextBit t = (QCTextBit) bit1;
text.append(t.getText().replace('\r', '\n')); text.append(t.getText().replace('\r', '\n'));
} }
@ -79,7 +81,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
// how to tie that together. // how to tie that together.
if(hyperlinksByDefault) { if(hyperlinksByDefault) {
for (QCBit bit : bits) { for (QCBit bit : bits) {
if (bit != null && bit instanceof Type12) { if (bit instanceof Type12) {
Type12 hyperlinks = (Type12) bit; Type12 hyperlinks = (Type12) bit;
for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) { for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) {
text.append("<"); text.append("<");
@ -96,19 +98,23 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
return text.toString(); return text.toString();
} }
@Override
public HPBFDocument getDocument() {
return doc;
}
public static void main(String[] args) throws Exception { @Override
if(args.length == 0) { public void setCloseFilesystem(boolean doCloseFilesystem) {
System.err.println("Use:"); this.doCloseFilesystem = doCloseFilesystem;
System.err.println(" PublisherTextExtractor <file.pub>"); }
}
for (String arg : args) { @Override
try (FileInputStream fis = new FileInputStream(arg)) { public boolean isCloseFilesystem() {
PublisherTextExtractor te = new PublisherTextExtractor(fis); return doCloseFilesystem;
System.out.println(te.getText()); }
te.close();
} @Override
} public HPBFDocument getFilesystem() {
return doc;
} }
} }

View File

@ -1,279 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hslf.extractor;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.extractor.POIOLE2TextExtractor;
import org.apache.poi.hslf.usermodel.HSLFObjectShape;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.sl.usermodel.SlideShowFactory;
import org.apache.poi.util.Removal;
/**
 * Text extractor for binary PowerPoint files that forwards all real work to a
 * {@link SlideShowExtractor} delegate. Can optionally also return notes,
 * comments and master-slide text.
 *
 * <p>The class keeps its own copy of the four "by default" flags so that the
 * explicit {@code getText(boolean...)} overloads can temporarily override the
 * delegate's settings and restore them afterwards.</p>
 *
 * @deprecated in POI 4.0.0, use {@link SlideShowExtractor} instead
 */
@SuppressWarnings("WeakerAccess")
@Deprecated
@Removal(version="5.0.0")
public final class PowerPointExtractor extends POIOLE2TextExtractor {
// Does the actual text extraction; created once in the HSLFSlideShow constructor.
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
// Local mirrors of the delegate's flags, used to restore them after getText(boolean...).
// Only slide text is enabled initially.
private boolean slidesByDefault = true;
private boolean notesByDefault;
private boolean commentsByDefault;
private boolean masterByDefault;
/**
 * Command-line entry point: prints the extracted text of a PowerPoint file
 * to standard output.
 *
 * <p>Argument handling, as implemented below:</p>
 * <ul>
 * <li>no arguments: a usage message is printed to stderr and the JVM exits with status 1</li>
 * <li>one argument: it is used as the file name; notes and comments are off</li>
 * <li>more than one argument: notes are enabled and the <em>second</em> argument is
 * used as the file name (the value of the first argument is not inspected)</li>
 * <li>more than two arguments: comments are enabled as well</li>
 * </ul>
 * Slide text and master text are always requested.
 *
 * @param args the command line arguments
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
if (args.length < 1) {
System.err.println("Usage:");
System.err.println("\tPowerPointExtractor [-notes] <file>");
System.exit(1);
}
boolean notes = false;
boolean comments = false;
boolean master = true;
String file;
if (args.length > 1) {
notes = true;
file = args[1];
if (args.length > 2) {
comments = true;
}
} else {
file = args[0];
}
try (PowerPointExtractor ppe = new PowerPointExtractor(file)) {
System.out.println(ppe.getText(true, notes, comments, master));
}
}
/**
 * Creates a PowerPointExtractor around an already loaded slideshow. All other
 * constructors end up here.
 *
 * <p>The slideshow's implementation object is passed to the superclass, the
 * slideshow itself is registered via {@code setFilesystem}, and the
 * {@link SlideShowExtractor} delegate is created for it.</p>
 *
 * @param slideShow the slideshow to extract text from
 */
public PowerPointExtractor(final HSLFSlideShow slideShow) {
super(slideShow.getSlideShowImpl());
setFilesystem(slideShow);
delegate = new SlideShowExtractor<>(slideShow);
}
/**
 * Creates a PowerPointExtractor, from a file. The file is opened with the
 * read-only flag set and with the password currently registered in
 * {@link Biff8EncryptionKey}.
 *
 * @param fileName The name of the file to extract from
 * @throws IOException if an error occurs while reading the data
 */
public PowerPointExtractor(String fileName) throws IOException {
this(createHSLF(new File(fileName), Biff8EncryptionKey.getCurrentUserPassword(), true));
}
/**
 * Creates a PowerPointExtractor, from an Input Stream, using the password
 * currently registered in {@link Biff8EncryptionKey}.
 *
 * @param iStream The input stream containing the PowerPoint document
 * @throws IOException if an error occurs while reading the data
 */
public PowerPointExtractor(InputStream iStream) throws IOException {
this(createHSLF(iStream, Biff8EncryptionKey.getCurrentUserPassword()));
}
/**
 * Creates a PowerPointExtractor, from an open POIFSFileSystem, using the
 * password currently registered in {@link Biff8EncryptionKey}.
 *
 * @param fs the POIFSFileSystem containing the PowerPoint document
 * @throws IOException if an error occurs while reading the data
 */
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
this(createHSLF(fs, Biff8EncryptionKey.getCurrentUserPassword()));
}
/**
 * Creates a PowerPointExtractor, from a specific place
 * inside an open {@link POIFSFileSystem}
 *
 * @param dir the POIFS Directory containing the PowerPoint document
 * @throws IOException if an error occurs while reading the data
 */
public PowerPointExtractor(DirectoryNode dir) throws IOException {
this(new HSLFSlideShow(dir));
}
/**
 * Creates a PowerPointExtractor, from a low-level HSLFSlideShowImpl, which is
 * wrapped in a new {@link HSLFSlideShow}.
 *
 * @param ss the HSLFSlideShowImpl to extract text from
 */
public PowerPointExtractor(HSLFSlideShowImpl ss) {
this(new HSLFSlideShow(ss));
}
/**
 * Should a call to getText() return slide text? Default is yes
 *
 * @param slidesByDefault the new default, stored locally and pushed to the delegate
 */
public void setSlidesByDefault(final boolean slidesByDefault) {
this.slidesByDefault = slidesByDefault;
delegate.setSlidesByDefault(slidesByDefault);
}
/**
 * Should a call to getText() return notes text? Default is no
 *
 * @param notesByDefault the new default, stored locally and pushed to the delegate
 */
public void setNotesByDefault(final boolean notesByDefault) {
this.notesByDefault = notesByDefault;
delegate.setNotesByDefault(notesByDefault);
}
/**
 * Should a call to getText() return comments text? Default is no
 *
 * @param commentsByDefault the new default, stored locally and pushed to the delegate
 */
public void setCommentsByDefault(final boolean commentsByDefault) {
this.commentsByDefault = commentsByDefault;
delegate.setCommentsByDefault(commentsByDefault);
}
/**
 * Should a call to getText() return text from master? Default is no
 *
 * @param masterByDefault the new default, stored locally and pushed to the delegate
 */
public void setMasterByDefault(final boolean masterByDefault) {
this.masterByDefault = masterByDefault;
delegate.setMasterByDefault(masterByDefault);
}
/**
 * Returns the delegate's text using the currently configured defaults
 * (see the {@code set...ByDefault} methods).
 *
 * @return the extracted text as produced by the delegate
 */
@Override
public String getText() {
return delegate.getText();
}
/**
 * Fetches text from the slideshow, be it slide text or note text. Comment
 * and master text follow the currently configured defaults.
 *
 * NOTE(review): earlier documentation stated that a stripped trailing newline
 * is added back; nothing in this class does that, so if it still happens it
 * is done by the delegate - confirm.
 *
 * @param getSlideText fetch slide text
 * @param getNoteText fetch note text
 * @return the extracted text as produced by the delegate
 */
public String getText(boolean getSlideText, boolean getNoteText) {
return getText(getSlideText,getNoteText,commentsByDefault,masterByDefault);
}
/**
 * Fetches text with all four content switches given explicitly.
 *
 * <p>The delegate's flags are overwritten for the duration of the call and
 * reset to the stored defaults in the {@code finally} block, so the defaults
 * are restored even if extraction throws. No synchronization is visible here,
 * so this is presumably not meant to be called concurrently on one instance.</p>
 *
 * @param getSlideText fetch slide text
 * @param getNoteText fetch note text
 * @param getCommentText fetch comment text
 * @param getMasterText fetch master text
 * @return the extracted text as produced by the delegate
 */
public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) {
delegate.setSlidesByDefault(getSlideText);
delegate.setNotesByDefault(getNoteText);
delegate.setCommentsByDefault(getCommentText);
delegate.setMasterByDefault(getMasterText);
try {
return delegate.getText();
} finally {
// put the delegate back into the state configured through the setters
delegate.setSlidesByDefault(slidesByDefault);
delegate.setNotesByDefault(notesByDefault);
delegate.setCommentsByDefault(commentsByDefault);
delegate.setMasterByDefault(masterByDefault);
}
}
/**
 * Fetches all the notes text from the slideshow, but not the slide text
 * (comments and master text are switched off as well).
 *
 * @return the notes text
 */
public String getNotes() {
return getText(false, true, false, false);
}
/**
 * Returns the OLE shapes reported by the delegate.
 *
 * <p>The delegate's result is cast, unchecked, to a list of
 * {@link HSLFObjectShape}.</p>
 *
 * @return the delegate's OLE shapes
 */
@SuppressWarnings("unchecked")
public List<HSLFObjectShape> getOLEShapes() {
return (List<HSLFObjectShape>)delegate.getOLEShapes();
}
/**
 * Helper method to avoid problems with compiling code in Eclipse
 *
 * Eclipse javac has some bugs with complex casts, this method tries
 * to work around this.
 *
 * @param fs The {@link POIFSFileSystem} to read the document from
 * @param password The password that should be used or null if no password is necessary.
 *
 * @return The created SlideShow
 *
 * @throws IOException if an error occurs while reading the data
 * @throws EncryptedDocumentException If the wrong password is given for a protected file
 */
private static HSLFSlideShow createHSLF(POIFSFileSystem fs, String password) throws IOException, EncryptedDocumentException {
// Note: don't change the code here, it is required for Eclipse to compile the code
SlideShow slideShowOrig = SlideShowFactory.create(fs, password);
return (HSLFSlideShow)slideShowOrig;
}
/**
 * Helper method to avoid problems with compiling code in Eclipse
 *
 * Eclipse javac has some bugs with complex casts, this method tries
 * to work around this.
 *
 * @param inp The {@link InputStream} to read data from.
 * @param password The password that should be used or null if no password is necessary.
 *
 * @return The created SlideShow
 *
 * @throws IOException if an error occurs while reading the data
 * @throws EncryptedDocumentException If the wrong password is given for a protected file
 */
private static HSLFSlideShow createHSLF(InputStream inp, String password) throws IOException, EncryptedDocumentException {
// Note: don't change the code here, it is required for Eclipse to compile the code
SlideShow slideShowOrig = SlideShowFactory.create(inp, password);
return (HSLFSlideShow)slideShowOrig;
}
/**
 * Helper method to avoid problems with compiling code in Eclipse
 *
 * Eclipse javac has some bugs with complex casts, this method tries
 * to work around this.
 *
 * @param file The file to read data from.
 * @param password The password that should be used or null if no password is necessary.
 * @param readOnly If the SlideShow should be opened in read-only mode to avoid writing back
 * changes when the document is closed.
 *
 * @return The created SlideShow
 *
 * @throws IOException if an error occurs while reading the data
 * @throws EncryptedDocumentException If the wrong password is given for a protected file
 */
private static HSLFSlideShow createHSLF(File file, String password, boolean readOnly) throws IOException, EncryptedDocumentException {
// Note: don't change the code here, it is required for Eclipse to compile the code
SlideShow slideShowOrig = SlideShowFactory.create(file, password, readOnly);
return (HSLFSlideShow)slideShowOrig;
}
}

View File

@ -33,6 +33,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.function.Supplier; import java.util.function.Supplier;
import org.apache.poi.POIDocument;
import org.apache.poi.common.usermodel.GenericRecord; import org.apache.poi.common.usermodel.GenericRecord;
import org.apache.poi.common.usermodel.fonts.FontInfo; import org.apache.poi.common.usermodel.fonts.FontInfo;
import org.apache.poi.ddf.EscherBSERecord; import org.apache.poi.ddf.EscherBSERecord;
@ -40,6 +41,9 @@ import org.apache.poi.ddf.EscherContainerRecord;
import org.apache.poi.ddf.EscherOptRecord; import org.apache.poi.ddf.EscherOptRecord;
import org.apache.poi.hpsf.ClassID; import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hpsf.ClassIDPredefined; import org.apache.poi.hpsf.ClassIDPredefined;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException; import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException;
import org.apache.poi.hslf.exceptions.HSLFException; import org.apache.poi.hslf.exceptions.HSLFException;
@ -47,6 +51,7 @@ import org.apache.poi.hslf.model.HeadersFooters;
import org.apache.poi.hslf.model.MovieShape; import org.apache.poi.hslf.model.MovieShape;
import org.apache.poi.hslf.record.*; import org.apache.poi.hslf.record.*;
import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet; import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -66,7 +71,7 @@ import org.apache.poi.util.Units;
* TODO: - figure out how to match notes to their correct sheet (will involve * TODO: - figure out how to match notes to their correct sheet (will involve
* understanding DocSlideList and DocNotesList) - handle Slide creation cleaner * understanding DocSlideList and DocNotesList) - handle Slide creation cleaner
*/ */
public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord { public final class HSLFSlideShow extends POIDocument implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord {
//arbitrarily selected; may need to increase //arbitrarily selected; may need to increase
private static final int MAX_RECORD_LENGTH = 10_000_000; private static final int MAX_RECORD_LENGTH = 10_000_000;
@ -111,6 +116,8 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
* @param hslfSlideShow the HSLFSlideShow to base on * @param hslfSlideShow the HSLFSlideShow to base on
*/ */
public HSLFSlideShow(HSLFSlideShowImpl hslfSlideShow) { public HSLFSlideShow(HSLFSlideShowImpl hslfSlideShow) {
super(hslfSlideShow.getDirectory());
loadSavePhase.set(LoadSavePhase.INIT); loadSavePhase.set(LoadSavePhase.INIT);
// Get useful things from our base slideshow // Get useful things from our base slideshow
@ -1179,4 +1186,94 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
public List<? extends GenericRecord> getGenericChildren() { public List<? extends GenericRecord> getGenericChildren() {
return Arrays.asList(_hslfSlideShow.getRecords()); return Arrays.asList(_hslfSlideShow.getRecords());
} }
/**
* Writes this slideshow by forwarding to {@code write()} on the object
* returned by {@code getSlideShowImpl()}.
*
* @throws IOException propagated from the delegated call
*/
@Override
public void write() throws IOException {
getSlideShowImpl().write();
}
/**
* Writes this slideshow to the given file by forwarding to
* {@code write(File)} on the object returned by {@code getSlideShowImpl()}.
*
* @param newFile the file handed on to the delegated call
* @throws IOException propagated from the delegated call
*/
@Override
public void write(File newFile) throws IOException {
getSlideShowImpl().write(newFile);
}
/**
* Returns the document summary information held by the object returned by
* {@code getSlideShowImpl()}; this class keeps no copy of its own.
*
* @return whatever the delegated {@code getDocumentSummaryInformation()} returns
*/
@Override
public DocumentSummaryInformation getDocumentSummaryInformation() {
return getSlideShowImpl().getDocumentSummaryInformation();
}
/**
* Returns the summary information held by the object returned by
* {@code getSlideShowImpl()}; this class keeps no copy of its own.
*
* @return whatever the delegated {@code getSummaryInformation()} returns
*/
@Override
public SummaryInformation getSummaryInformation() {
return getSlideShowImpl().getSummaryInformation();
}
/**
* Forwards to {@code createInformationProperties()} on the object returned
* by {@code getSlideShowImpl()}, so the properties live on that object.
*/
@Override
public void createInformationProperties() {
getSlideShowImpl().createInformationProperties();
}
/**
* Forwards to {@code readProperties()} on the object returned by
* {@code getSlideShowImpl()}.
*/
@Override
public void readProperties() {
getSlideShowImpl().readProperties();
}
/**
* Looks up a property set by name through {@code getPropertySetImpl(String)}
* on the object returned by {@code getSlideShowImpl()}.
*
* @param setName the name passed through to the delegated call
* @return whatever the delegated call returns
* @throws IOException propagated from the delegated call
*/
@Override
protected PropertySet getPropertySet(String setName) throws IOException {
return getSlideShowImpl().getPropertySetImpl(setName);
}
/**
* Looks up a property set by name and encryption info through
* {@code getPropertySetImpl(String, EncryptionInfo)} on the object returned
* by {@code getSlideShowImpl()}.
*
* @param setName the name passed through to the delegated call
* @param encryptionInfo the encryption info passed through to the delegated call
* @return whatever the delegated call returns
* @throws IOException propagated from the delegated call
*/
@Override
protected PropertySet getPropertySet(String setName, EncryptionInfo encryptionInfo) throws IOException {
return getSlideShowImpl().getPropertySetImpl(setName, encryptionInfo);
}
/**
* Writes the properties through {@code writePropertiesImpl()} on the object
* returned by {@code getSlideShowImpl()}.
*
* @throws IOException propagated from the delegated call
*/
@Override
protected void writeProperties() throws IOException {
getSlideShowImpl().writePropertiesImpl();
}
/**
* Writes the properties into the given file system by forwarding to
* {@code writeProperties(POIFSFileSystem)} on the object returned by
* {@code getSlideShowImpl()}.
*
* @param outFS the target file system passed through to the delegated call
* @throws IOException propagated from the delegated call
*/
@Override
public void writeProperties(POIFSFileSystem outFS) throws IOException {
getSlideShowImpl().writeProperties(outFS);
}
/**
* Writes the properties into the given file system through
* {@code writePropertiesImpl(POIFSFileSystem, List)} on the object returned
* by {@code getSlideShowImpl()}.
*
* @param outFS the target file system passed through to the delegated call
* @param writtenEntries the entry-name list passed through to the delegated call
* @throws IOException propagated from the delegated call
*/
@Override
protected void writeProperties(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException {
getSlideShowImpl().writePropertiesImpl(outFS, writtenEntries);
}
/**
* Runs the in-place-write check through
* {@code validateInPlaceWritePossibleImpl()} on the object returned by
* {@code getSlideShowImpl()}.
*
* @throws IllegalStateException propagated from the delegated check
*/
@Override
protected void validateInPlaceWritePossible() throws IllegalStateException {
getSlideShowImpl().validateInPlaceWritePossibleImpl();
}
/**
* Returns the directory node reported by the object returned by
* {@code getSlideShowImpl()}, rather than any state held by this class.
*
* @return whatever the delegated {@code getDirectory()} returns
*/
@Override
public DirectoryNode getDirectory() {
return getSlideShowImpl().getDirectory();
}
/**
* Forwards to {@code clearDirectoryImpl()} on the object returned by
* {@code getSlideShowImpl()}.
*/
@Override
protected void clearDirectory() {
getSlideShowImpl().clearDirectoryImpl();
}
/**
* Forwards to {@code initDirectoryImpl()} on the object returned by
* {@code getSlideShowImpl()}.
*
* @return the boolean result of the delegated call
*/
@Override
protected boolean initDirectory() {
return getSlideShowImpl().initDirectoryImpl();
}
/**
* Forwards to {@code replaceDirectoryImpl(DirectoryNode)} on the object
* returned by {@code getSlideShowImpl()}.
*
* @param newDirectory the directory node passed through to the delegated call
*/
@Override
protected void replaceDirectory(DirectoryNode newDirectory) {
getSlideShowImpl().replaceDirectoryImpl(newDirectory);
}
/**
* Returns the encrypted property stream name reported by the object
* returned by {@code getSlideShowImpl()}.
*
* @return whatever the delegated {@code getEncryptedPropertyStreamName()} returns
*/
@Override
protected String getEncryptedPropertyStreamName() {
return getSlideShowImpl().getEncryptedPropertyStreamName();
}
/**
* Returns the encryption info reported by the object returned by
* {@code getSlideShowImpl()}.
*
* @return whatever the delegated {@code getEncryptionInfo()} returns
* @throws IOException propagated from the delegated call
*/
@Override
public EncryptionInfo getEncryptionInfo() throws IOException {
return getSlideShowImpl().getEncryptionInfo();
}
} }

View File

@ -36,6 +36,7 @@ import java.util.NavigableMap;
import java.util.TreeMap; import java.util.TreeMap;
import org.apache.poi.POIDocument; import org.apache.poi.POIDocument;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException; import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException;
import org.apache.poi.hslf.exceptions.HSLFException; import org.apache.poi.hslf.exceptions.HSLFException;
import org.apache.poi.hslf.exceptions.OldPowerPointFormatException; import org.apache.poi.hslf.exceptions.OldPowerPointFormatException;
@ -714,8 +715,6 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
} }
/* ******************* adding methods follow ********************* */ /* ******************* adding methods follow ********************* */
/** /**
@ -850,6 +849,38 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
return "EncryptedSummary"; return "EncryptedSummary";
} }
/**
* Package-visible bridge that invokes the inherited {@code writeProperties()}.
* NOTE(review): appears to exist so HSLFSlideShow's delegating override can
* reach the superclass method through this object - confirm against callers.
*
* @throws IOException propagated from the superclass call
*/
void writePropertiesImpl() throws IOException {
super.writeProperties();
}
/**
* Package-visible bridge that invokes the inherited
* {@code getPropertySet(String)}.
*
* @param setName the name passed through to the superclass call
* @return whatever the superclass call returns
* @throws IOException propagated from the superclass call
*/
PropertySet getPropertySetImpl(String setName) throws IOException {
return super.getPropertySet(setName);
}
/**
* Package-visible bridge that invokes the inherited
* {@code getPropertySet(String, EncryptionInfo)}.
*
* @param setName the name passed through to the superclass call
* @param encryptionInfo the encryption info passed through to the superclass call
* @return whatever the superclass call returns
* @throws IOException propagated from the superclass call
*/
PropertySet getPropertySetImpl(String setName, EncryptionInfo encryptionInfo) throws IOException {
return super.getPropertySet(setName, encryptionInfo);
}
/**
* Package-visible bridge that invokes the inherited
* {@code writeProperties(POIFSFileSystem, List)}.
*
* @param outFS the target file system passed through to the superclass call
* @param writtenEntries the entry-name list passed through to the superclass call
* @throws IOException propagated from the superclass call
*/
void writePropertiesImpl(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException {
super.writeProperties(outFS, writtenEntries);
}
/**
* Package-visible bridge that invokes the inherited
* {@code validateInPlaceWritePossible()}.
*
* @throws IllegalStateException propagated from the superclass check
*/
void validateInPlaceWritePossibleImpl() throws IllegalStateException {
super.validateInPlaceWritePossible();
}
/**
* Package-visible bridge that invokes the inherited {@code clearDirectory()}.
*/
void clearDirectoryImpl() {
super.clearDirectory();
}
/**
* Package-visible bridge that invokes the inherited {@code initDirectory()}.
*
* @return the boolean result of the superclass call
*/
boolean initDirectoryImpl() {
return super.initDirectory();
}
/**
* Package-visible bridge that invokes the inherited
* {@code replaceDirectory(DirectoryNode)}.
*
* @param newDirectory the directory node passed through to the superclass call
*/
void replaceDirectoryImpl(DirectoryNode newDirectory) {
super.replaceDirectory(newDirectory);
}
private static class BufAccessBAOS extends ByteArrayOutputStream { private static class BufAccessBAOS extends ByteArrayOutputStream {
public byte[] getBuf() { public byte[] getBuf() {
return buf; return buf;

View File

@ -1,61 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hsmf.extractor;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.Removal;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
/**
* A text extractor for HSMF (Outlook) .msg files.
* Outputs in a format somewhat like a plain text email.
* <p>
* This class only exists under its historical, misspelled name; it adds no
* behaviour of its own and every constructor forwards to
* {@link OutlookTextExtractor}.
*
* @deprecated use {@link OutlookTextExtractor} instead
*/
@Deprecated
@Removal(version = "5.0.0")
public class OutlookTextExtactor extends OutlookTextExtractor {
/**
* Creates an extractor for an already opened message.
*
* @param msg the message handed to the superclass constructor
*/
public OutlookTextExtactor(MAPIMessage msg) {
super(msg);
}
/**
* Creates an extractor for a message built from the given directory node.
*
* @param poifsDir the directory node the {@link MAPIMessage} is created from
* @throws IOException propagated from the {@link MAPIMessage} constructor
*/
public OutlookTextExtactor(DirectoryNode poifsDir) throws IOException {
super(new MAPIMessage(poifsDir));
}
/**
* Creates an extractor for a message built from the given file system.
*
* @param fs the file system the {@link MAPIMessage} is created from
* @throws IOException propagated from the {@link MAPIMessage} constructor
*/
public OutlookTextExtactor(POIFSFileSystem fs) throws IOException {
super(new MAPIMessage(fs));
}
/**
* Creates an extractor for a message built from the given stream.
*
* @param inp the stream the {@link MAPIMessage} is created from
* @throws IOException propagated from the {@link MAPIMessage} constructor
*/
public OutlookTextExtactor(InputStream inp) throws IOException {
super(new MAPIMessage(inp));
}
/**
* Command line entry point: prints the extracted text of every file named
* in {@code args} to standard output. Each file system and extractor is
* opened in a try-with-resources block, so both are closed per file.
*
* @param args the paths of the files to extract text from
* @throws Exception if opening a file or extracting its text fails
*/
public static void main(String[] args) throws Exception {
for (String filename : args) {
try (POIFSFileSystem poifs = new POIFSFileSystem(new File(filename));
OutlookTextExtractor extractor = new OutlookTextExtractor(poifs)) {
System.out.println(extractor.getText());
}
}
}
}

View File

@ -42,9 +42,12 @@ import org.apache.poi.util.LocaleUtil;
* *
* @since 4.1.2 * @since 4.1.2
*/ */
public class OutlookTextExtractor extends POIOLE2TextExtractor { public class OutlookTextExtractor implements POIOLE2TextExtractor {
private final MAPIMessage msg;
private boolean doCloseFilesystem = true;
public OutlookTextExtractor(MAPIMessage msg) { public OutlookTextExtractor(MAPIMessage msg) {
super(msg); this.msg = msg;
} }
public OutlookTextExtractor(DirectoryNode poifsDir) throws IOException { public OutlookTextExtractor(DirectoryNode poifsDir) throws IOException {
@ -76,14 +79,13 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor {
* Returns the underlying MAPI message * Returns the underlying MAPI message
*/ */
public MAPIMessage getMAPIMessage() { public MAPIMessage getMAPIMessage() {
return (MAPIMessage) document; return msg;
} }
/** /**
* Outputs something a little like a RFC822 email * Outputs something a little like a RFC822 email
*/ */
public String getText() { public String getText() {
MAPIMessage msg = (MAPIMessage) document;
StringBuilder s = new StringBuilder(); StringBuilder s = new StringBuilder();
// See if we can get a suitable encoding for any // See if we can get a suitable encoding for any
@ -201,4 +203,24 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor {
} }
s.append("\n"); s.append("\n");
} }
/**
* Returns the message this extractor was created for.
*
* @return the {@code msg} field
*/
@Override
public MAPIMessage getDocument() {
return msg;
}
/**
* Stores the close-filesystem flag.
* NOTE(review): presumably consulted when the extractor is closed - confirm
* against POIOLE2TextExtractor.
*
* @param doCloseFilesystem the new value of the flag
*/
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
/**
* Returns the close-filesystem flag as last set through
* {@code setCloseFilesystem(boolean)}.
*
* @return the current value of the {@code doCloseFilesystem} field
*/
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
/**
* Returns the message itself as the "filesystem" of this extractor, i.e. the
* same object that {@code getDocument()} returns.
*
* @return the {@code msg} field
*/
@Override
public MAPIMessage getFilesystem() {
return msg;
}
} }

View File

@ -36,8 +36,9 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* *
* @author Nick Burch * @author Nick Burch
*/ */
public final class Word6Extractor extends POIOLE2TextExtractor { public final class Word6Extractor implements POIOLE2TextExtractor {
private HWPFOldDocument doc; private HWPFOldDocument doc;
private boolean doCloseFilesystem = true;
/** /**
* Create a new Word Extractor * Create a new Word Extractor
@ -53,8 +54,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
* @param fs * @param fs
* POIFSFileSystem containing the word file * POIFSFileSystem containing the word file
*/ */
public Word6Extractor( POIFSFileSystem fs ) throws IOException public Word6Extractor( POIFSFileSystem fs ) throws IOException {
{
this( fs.getRoot() ); this( fs.getRoot() );
} }
@ -62,14 +62,11 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
* @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
*/ */
@Deprecated @Deprecated
public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) throws IOException {
throws IOException
{
this( dir ); this( dir );
} }
public Word6Extractor( DirectoryNode dir ) throws IOException public Word6Extractor( DirectoryNode dir ) throws IOException {
{
this( new HWPFOldDocument( dir ) ); this( new HWPFOldDocument( dir ) );
} }
@ -78,7 +75,6 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
* @param doc The HWPFOldDocument to extract from * @param doc The HWPFOldDocument to extract from
*/ */
public Word6Extractor(HWPFOldDocument doc) { public Word6Extractor(HWPFOldDocument doc) {
super(doc);
this.doc = doc; this.doc = doc;
} }
@ -111,25 +107,40 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
return ret; return ret;
} }
public String getText() public String getText() {
{ try {
try
{
WordToTextConverter wordToTextConverter = new WordToTextConverter(); WordToTextConverter wordToTextConverter = new WordToTextConverter();
wordToTextConverter.processDocument( doc ); wordToTextConverter.processDocument( doc );
return wordToTextConverter.getText(); return wordToTextConverter.getText();
} } catch ( Exception exc ) {
catch ( Exception exc )
{
// fall-back // fall-back
StringBuilder text = new StringBuilder(); StringBuilder text = new StringBuilder();
for ( String t : getParagraphText() ) for ( String t : getParagraphText() ) {
{
text.append( t ); text.append( t );
} }
return text.toString(); return text.toString();
} }
} }
/**
* Returns the document this extractor was created for.
*
* @return the {@code doc} field
*/
@Override
public HWPFOldDocument getDocument() {
return doc;
}
/**
* Stores the close-filesystem flag.
* NOTE(review): presumably consulted when the extractor is closed - confirm
* against POIOLE2TextExtractor.
*
* @param doCloseFilesystem the new value of the flag
*/
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
/**
* Returns the close-filesystem flag as last set through
* {@code setCloseFilesystem(boolean)}.
*
* @return the current value of the {@code doCloseFilesystem} field
*/
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
/**
* Returns the document itself as the "filesystem" of this extractor, i.e.
* the same object that {@code getDocument()} returns.
*
* @return the {@code doc} field
*/
@Override
public HWPFOldDocument getFilesystem() {
return doc;
}
} }

View File

@ -17,7 +17,6 @@
package org.apache.poi.hwpf.extractor; package org.apache.poi.hwpf.extractor;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -39,8 +38,9 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* *
* @author Nick Burch * @author Nick Burch
*/ */
public final class WordExtractor extends POIOLE2TextExtractor { public final class WordExtractor implements POIOLE2TextExtractor {
private HWPFDocument doc; private final HWPFDocument doc;
private boolean doCloseFilesystem = true;
/** /**
* Create a new Word Extractor * Create a new Word Extractor
@ -73,29 +73,9 @@ public final class WordExtractor extends POIOLE2TextExtractor {
* The HWPFDocument to extract from * The HWPFDocument to extract from
*/ */
public WordExtractor( HWPFDocument doc ) { public WordExtractor( HWPFDocument doc ) {
super( doc );
this.doc = doc; this.doc = doc;
} }
/**
 * Command line extractor, so people will stop moaning that they can't just
 * run this.
 * <p>
 * Prints a usage message and exits with status 1 when no argument is given;
 * otherwise extracts the text of the file named by the first argument and
 * prints it to standard output.
 *
 * @param args the command line arguments; only {@code args[0]} is used
 * @throws IOException if the file cannot be opened or read
 */
public static void main( String[] args ) throws IOException {
    if ( args.length == 0 ) {
        System.err.println( "Use:" );
        System.err
                .println( "   java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
        System.exit( 1 );
    }

    // Process the first argument as a file. The stream is declared as a
    // resource too, so it is closed even if the extractor constructor throws.
    try (InputStream fin = new FileInputStream( args[0] );
         WordExtractor extractor = new WordExtractor(fin)) {
        System.out.println(extractor.getText());
    }
}
/** /**
* Get the text from the word file, as an array with one String per * Get the text from the word file, as an array with one String per
* paragraph * paragraph
@ -142,7 +122,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
return getParagraphText( r ); return getParagraphText( r );
} }
protected static String[] getParagraphText( Range r ) { static String[] getParagraphText( Range r ) {
String[] ret; String[] ret;
ret = new String[r.numParagraphs()]; ret = new String[r.numParagraphs()];
for ( int i = 0; i < ret.length; i++ ) { for ( int i = 0; i < ret.length; i++ ) {
@ -287,8 +267,27 @@ public final class WordExtractor extends POIOLE2TextExtractor {
/** /**
* Removes any fields (eg macros, page markers etc) from the string. * Removes any fields (eg macros, page markers etc) from the string.
*/ */
public static String stripFields( String text ) public static String stripFields( String text ) {
{
return Range.stripFields( text ); return Range.stripFields( text );
} }
/**
* Returns the document this extractor was created for.
*
* @return the {@code doc} field
*/
@Override
public HWPFDocument getDocument() {
return doc;
}
/**
* Stores the close-filesystem flag.
* NOTE(review): presumably consulted when the extractor is closed - confirm
* against POIOLE2TextExtractor.
*
* @param doCloseFilesystem the new value of the flag
*/
@Override
public void setCloseFilesystem(boolean doCloseFilesystem) {
this.doCloseFilesystem = doCloseFilesystem;
}
/**
* Returns the close-filesystem flag as last set through
* {@code setCloseFilesystem(boolean)}.
*
* @return the current value of the {@code doCloseFilesystem} field
*/
@Override
public boolean isCloseFilesystem() {
return doCloseFilesystem;
}
/**
* Returns the document itself as the "filesystem" of this extractor, i.e.
* the same object that {@code getDocument()} returns.
*
* @return the {@code doc} field
*/
@Override
public HWPFDocument getFilesystem() {
return doc;
}
} }

View File

@ -19,12 +19,9 @@ package org.apache.poi.hdgf.extractor;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.PrintStream;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.hdgf.HDGFDiagram; import org.apache.poi.hdgf.HDGFDiagram;
@ -32,7 +29,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.Test; import org.junit.Test;
public final class TestVisioExtractor { public final class TestVisioExtractor {
private static POIDataSamples _dgTests = POIDataSamples.getDiagramInstance(); private static final POIDataSamples _dgTests = POIDataSamples.getDiagramInstance();
private final String defFilename = "Test_Visio-Some_Random_Text.vsd"; private final String defFilename = "Test_Visio-Some_Random_Text.vsd";
private final int defTextChunks = 5; private final int defTextChunks = 5;
@ -108,31 +105,6 @@ public final class TestVisioExtractor {
} }
} }
/**
 * Runs {@code VisioTextExtractor.main} on the default sample file with
 * {@code System.out} redirected into a buffer, then checks that the captured
 * output contains the expected text.
 */
@Test
public void testMain() throws Exception {
    PrintStream oldOut = System.out;
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    PrintStream capture = new PrintStream(baos);
    System.setOut(capture);

    try {
        String path = _dgTests.getFile(defFilename).getPath();
        VisioTextExtractor.main(new String[] {path});
    } finally {
        // Put things back even when main() throws, so that later tests do
        // not keep writing into the capture buffer
        System.setOut(oldOut);
    }

    // Check
    capture.flush();
    String text = baos.toString();
    // YK: stdout can contain lots of other stuff if logging is sent to console
    // ( -Dorg.apache.poi.util.POILogger=org.apache.poi.util.SystemOutLogger)
    assertTrue( text.contains(
        "text\nView\n" +
        "Test View\nI am a test view\n" +
        "Some random text, on a page\n"
    ));
}
private VisioTextExtractor openExtractor(String fileName) throws IOException { private VisioTextExtractor openExtractor(String fileName) throws IOException {
try (InputStream is = _dgTests.openResourceAsStream(fileName)) { try (InputStream is = _dgTests.openResourceAsStream(fileName)) {
return new VisioTextExtractor(is); return new VisioTextExtractor(is);

View File

@ -42,7 +42,6 @@ import org.apache.poi.hsmf.datatypes.PropertyValue;
import org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue; import org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue;
import org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue; import org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue;
import org.apache.poi.hsmf.dev.HSMFDump; import org.apache.poi.hsmf.dev.HSMFDump;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hsmf.extractor.OutlookTextExtractor; import org.apache.poi.hsmf.extractor.OutlookTextExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LocaleUtil; import org.apache.poi.util.LocaleUtil;
@ -144,30 +143,20 @@ public final class TestFixedSizedProperties {
@Test @Test
public void testReadMessageDateSucceedsWithOutlookTextExtractor() throws Exception { public void testReadMessageDateSucceedsWithOutlookTextExtractor() throws Exception {
OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageSucceeds); OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageSucceeds);
ext.setFilesystem(null); // Don't close re-used test resources here ext.setCloseFilesystem(false);
String text = ext.getText(); String text = ext.getText();
assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n"); assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n");
ext.close(); ext.close();
} }
/**
 * Checks that the deprecated {@code OutlookTextExtactor} reads the message
 * date of the shared {@code mapiMessageSucceeds} fixture.
 */
@Test
public void testReadMessageDateSucceedsWithOutlookTextExtactor() throws Exception {
    // try-with-resources: the extractor is closed even if the assertion fails
    try (OutlookTextExtactor ext = new OutlookTextExtactor(mapiMessageSucceeds)) {
        ext.setFilesystem(null); // Don't close re-used test resources here

        String text = ext.getText();
        assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n");
    }
}
/** /**
* Test to see if we can read the Date Chunk with OutlookTextExtractor. * Test to see if we can read the Date Chunk with OutlookTextExtractor.
*/ */
@Test @Test
public void testReadMessageDateFailsWithOutlookTextExtractor() throws Exception { public void testReadMessageDateFailsWithOutlookTextExtractor() throws Exception {
OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageFails); OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageFails);
ext.setFilesystem(null); // Don't close re-used test resources here ext.setCloseFilesystem(false);
String text = ext.getText(); String text = ext.getText();
assertContains(text, "Date: Thu, 21 Jun 2012 14:14:04 +0000\n"); assertContains(text, "Date: Thu, 21 Jun 2012 14:14:04 +0000\n");

View File

@ -20,7 +20,6 @@ package org.apache.poi.hsmf.extractor;
import static org.apache.poi.POITestCase.assertContains; import static org.apache.poi.POITestCase.assertContains;
import static org.apache.poi.POITestCase.assertNotContained; import static org.apache.poi.POITestCase.assertNotContained;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
@ -57,68 +56,62 @@ public final class TestOutlookTextExtractor {
@Test @Test
public void testQuick() throws Exception { public void testQuick() throws Exception {
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true); try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true);
MAPIMessage msg = new MAPIMessage(poifs); MAPIMessage msg = new MAPIMessage(poifs);
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
String text = ext.getText();
OutlookTextExtractor ext = new OutlookTextExtractor(msg); assertContains(text, "From: Kevin Roast\n");
String text = ext.getText(); assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n");
assertNotContained(text, "CC:");
assertContains(text, "From: Kevin Roast\n"); assertNotContained(text, "BCC:");
assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n"); assertNotContained(text, "Attachment:");
assertNotContained(text, "CC:"); assertContains(text, "Subject: Test the content transformer\n");
assertNotContained(text, "BCC:"); Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55);
assertNotContained(text, "Attachment:"); SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
assertContains(text, "Subject: Test the content transformer\n"); f.setTimeZone(LocaleUtil.getUserTimeZone());
Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55); String dateText = f.format(cal.getTime());
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT); assertContains(text, "Date: " + dateText + "\n");
f.setTimeZone(LocaleUtil.getUserTimeZone()); assertContains(text, "The quick brown fox jumps over the lazy dog");
String dateText = f.format(cal.getTime()); }
assertContains(text, "Date: " + dateText + "\n");
assertContains(text, "The quick brown fox jumps over the lazy dog");
ext.close();
poifs.close();
} }
@Test @Test
public void testSimple() throws Exception { public void testSimple() throws Exception {
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
MAPIMessage msg = new MAPIMessage(poifs); MAPIMessage msg = new MAPIMessage(poifs);
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
String text = ext.getText();
OutlookTextExtractor ext = new OutlookTextExtractor(msg); assertContains(text, "From: Travis Ferguson\n");
String text = ext.getText(); assertContains(text, "To: travis@overwrittenstack.com\n");
assertNotContained(text, "CC:");
assertContains(text, "From: Travis Ferguson\n"); assertNotContained(text, "BCC:");
assertContains(text, "To: travis@overwrittenstack.com\n"); assertContains(text, "Subject: test message\n");
assertNotContained(text, "CC:"); assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n");
assertNotContained(text, "BCC:"); assertContains(text, "This is a test message.");
assertContains(text, "Subject: test message\n"); }
assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n");
assertContains(text, "This is a test message.");
ext.close();
poifs.close();
} }
@Test @Test
public void testConstructors() throws Exception { public void testConstructors() throws Exception {
FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); String inp;
OutlookTextExtractor ext = new OutlookTextExtractor(fis); try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
String inp = ext.getText(); OutlookTextExtractor ext = new OutlookTextExtractor(fis)) {
ext.close(); inp = ext.getText();
fis.close(); }
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true); String poifsTxt;
ext = new OutlookTextExtractor(poifs); try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
String poifsTxt = ext.getText(); OutlookTextExtractor ext = new OutlookTextExtractor(poifs)){
ext.close(); poifsTxt = ext.getText();
poifs.close(); }
fis = new FileInputStream(samples.getFile("simple_test_msg.msg")); String mapi;
ext = new OutlookTextExtractor(new MAPIMessage(fis)); try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
String mapi = ext.getText(); OutlookTextExtractor ext = new OutlookTextExtractor(new MAPIMessage(fis))) {
ext.close(); mapi = ext.getText();
fis.close(); }
assertEquals(inp, poifsTxt); assertEquals(inp, poifsTxt);
assertEquals(inp, mapi); assertEquals(inp, mapi);
@ -142,25 +135,22 @@ public final class TestOutlookTextExtractor {
"example_sent_regular.msg", "example_sent_unicode.msg" "example_sent_regular.msg", "example_sent_unicode.msg"
}; };
for (String file : files) { for (String file : files) {
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
MAPIMessage msg = new MAPIMessage(poifs); MAPIMessage msg = new MAPIMessage(poifs);
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
String text = ext.getText();
OutlookTextExtractor ext = new OutlookTextExtractor(msg); assertContains(text, "From: Mike Farman\n");
String text = ext.getText(); assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
assertContains(text, "From: Mike Farman\n"); assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " +
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + "'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " +
assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " + "'Vonka Jan' <jan.vonka@alfresco.com>\n");
"'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); assertContains(text, "Subject: This is a test message please ignore\n");
assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " + assertContains(text, "Date:");
"'Vonka Jan' <jan.vonka@alfresco.com>\n"); assertContains(text, "The quick brown fox jumps over the lazy dog");
assertContains(text, "Subject: This is a test message please ignore\n"); }
assertContains(text, "Date:");
assertContains(text, "The quick brown fox jumps over the lazy dog");
ext.close();
poifs.close();
} }
} }
@ -182,25 +172,21 @@ public final class TestOutlookTextExtractor {
"example_received_regular.msg", "example_received_unicode.msg" "example_received_regular.msg", "example_received_unicode.msg"
}; };
for (String file : files) { for (String file : files) {
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true); try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
MAPIMessage msg = new MAPIMessage(poifs); MAPIMessage msg = new MAPIMessage(poifs);
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
String text = ext.getText();
assertContains(text, "From: Mike Farman\n");
OutlookTextExtractor ext = new OutlookTextExtractor(msg); assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
String text = ext.getText(); "'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
assertContains(text, "CC: nickb@alfresco.com; " +
assertContains(text, "From: Mike Farman\n"); "nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " + assertNotContained(text, "BCC:");
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n"); assertContains(text, "Subject: This is a test message please ignore\n");
assertContains(text, "CC: nickb@alfresco.com; " + assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n"); assertContains(text, "The quick brown fox jumps over the lazy dog");
assertNotContained(text, "BCC:"); }
assertContains(text, "Subject: This is a test message please ignore\n");
assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly
assertContains(text, "The quick brown fox jumps over the lazy dog");
ext.close();
poifs.close();
} }
} }
@ -210,85 +196,59 @@ public final class TestOutlookTextExtractor {
@SuppressWarnings("JavadocReference") @SuppressWarnings("JavadocReference")
@Test @Test
public void testWithAttachments() throws Exception { public void testWithAttachments() throws Exception {
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true); try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true);
MAPIMessage msg = new MAPIMessage(poifs); MAPIMessage msg = new MAPIMessage(poifs);
OutlookTextExtractor ext = new OutlookTextExtractor(msg); OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
// Check the normal bits // Check the normal bits
String text = ext.getText(); String text = ext.getText();
assertContains(text, "From: Nicolas1"); assertContains(text, "From: Nicolas1");
assertContains(text, "To: 'nicolas1.23456@free.fr'"); assertContains(text, "To: 'nicolas1.23456@free.fr'");
assertNotContained(text, "CC:"); assertNotContained(text, "CC:");
assertNotContained(text, "BCC:"); assertNotContained(text, "BCC:");
assertContains(text, "Subject: test"); assertContains(text, "Subject: test");
assertContains(text, "Date: Wed, 22 Apr"); assertContains(text, "Date: Wed, 22 Apr");
assertContains(text, "Attachment: test-unicode.doc\n"); assertContains(text, "Attachment: test-unicode.doc\n");
assertContains(text, "Attachment: pj1.txt\n"); assertContains(text, "Attachment: pj1.txt\n");
assertContains(text, "contenu"); assertContains(text, "contenu");
// Embeded bits are checked in // Embeded bits are checked in
// TestExtractorFactory // TestExtractorFactory
}
ext.close();
poifs.close();
} }
@Test @Test
public void testWithAttachedMessage() throws Exception { public void testWithAttachedMessage() throws Exception {
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true); try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true);
MAPIMessage msg = new MAPIMessage(poifs); MAPIMessage msg = new MAPIMessage(poifs);
OutlookTextExtractor ext = new OutlookTextExtractor(msg); OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
String text = ext.getText(); String text = ext.getText();
// Check we got bits from the main message // Check we got bits from the main message
assertContains(text, "Master mail"); assertContains(text, "Master mail");
assertContains(text, "ante in lacinia euismod"); assertContains(text, "ante in lacinia euismod");
// But not the attached message // But not the attached message
assertNotContained(text, "Test mail attachment"); assertNotContained(text, "Test mail attachment");
assertNotContained(text, "Lorem ipsum dolor sit"); assertNotContained(text, "Lorem ipsum dolor sit");
}
ext.close();
poifs.close();
} }
@Test @Test
public void testEncodings() throws Exception { public void testEncodings() throws Exception {
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true); try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true);
MAPIMessage msg = new MAPIMessage(poifs); MAPIMessage msg = new MAPIMessage(poifs);
OutlookTextExtractor ext = new OutlookTextExtractor(msg); OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
String text = ext.getText(); String text = ext.getText();
// Check the english bits // Check the english bits
assertContains(text, "From: Tests Chang@FT"); assertContains(text, "From: Tests Chang@FT");
assertContains(text, "tests.chang@fengttt.com"); assertContains(text, "tests.chang@fengttt.com");
// And check some chinese bits // And check some chinese bits
assertContains(text, "(\u5f35\u6bd3\u502b)"); assertContains(text, "(\u5f35\u6bd3\u502b)");
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )"); assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
}
ext.close();
poifs.close();
}
@Test
public void testEncodingsDeprecatedClass() throws Exception {
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true);
MAPIMessage msg = new MAPIMessage(poifs);
OutlookTextExtactor ext = new OutlookTextExtactor(msg);
assertTrue("OutlookTextExtactor instanceof OutlookTextExtractor", ext instanceof OutlookTextExtractor);
String text = ext.getText();
// Check the english bits
assertContains(text, "From: Tests Chang@FT");
assertContains(text, "tests.chang@fengttt.com");
// And check some chinese bits
assertContains(text, "(\u5f35\u6bd3\u502b)");
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
ext.close();
poifs.close();
} }
} }

View File

@ -17,16 +17,16 @@
package org.apache.poi.hwpf.extractor; package org.apache.poi.hwpf.extractor;
import org.apache.poi.POIDataSamples; import static org.junit.Assert.assertNotNull;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.extractor.OLE2ExtractorFactory;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.Test;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import static org.junit.Assert.assertNotNull; import org.apache.poi.POIDataSamples;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.Test;
/** /**
* Tests for bugs with the WordExtractor * Tests for bugs with the WordExtractor
@ -61,7 +61,7 @@ public final class TestWordExtractorBugs {
@Test @Test
public void testBug60374() throws Exception { public void testBug60374() throws Exception {
POIFSFileSystem fs = new POIFSFileSystem(SAMPLES.openResourceAsStream("cn.orthodox.www_divenbog_APRIL_30-APRIL.DOC")); POIFSFileSystem fs = new POIFSFileSystem(SAMPLES.openResourceAsStream("cn.orthodox.www_divenbog_APRIL_30-APRIL.DOC"));
final POITextExtractor extractor = OLE2ExtractorFactory.createExtractor(fs); final POITextExtractor extractor = ExtractorFactory.createExtractor(fs);
// Check it gives text without error // Check it gives text without error
assertNotNull(extractor.getText()); assertNotNull(extractor.getText());

View File

@ -25,7 +25,7 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.POIDataSamples; import org.apache.poi.POIDataSamples;
import org.apache.poi.hpsf.*; import org.apache.poi.hpsf.Thumbnail;
import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hssf.usermodel.HSSFWorkbook;
@ -101,42 +101,31 @@ public final class TestHPSFPropertiesExtractor {
@Test @Test
public void testConstructors() throws IOException { public void testConstructors() throws IOException {
POIFSFileSystem fs;
HSSFWorkbook wb;
try {
fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls"));
wb = new HSSFWorkbook(fs);
} catch (IOException e) {
throw new RuntimeException(e);
}
ExcelExtractor excelExt = new ExcelExtractor(wb);
final String fsText; final String fsText;
HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs);
fsExt.setFilesystem(null); // Don't close re-used test resources!
try {
fsText = fsExt.getText();
} finally {
fsExt.close();
}
final String hwText; final String hwText;
HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb);
hwExt.setFilesystem(null); // Don't close re-used test resources!
try {
hwText = hwExt.getText();
} finally {
hwExt.close();
}
final String eeText; final String eeText;
HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt);
eeExt.setFilesystem(null); // Don't close re-used test resources! try (POIFSFileSystem fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls"));
try { HSSFWorkbook wb = new HSSFWorkbook(fs);
eeText = eeExt.getText(); ExcelExtractor excelExt = new ExcelExtractor(wb)) {
} finally {
eeExt.close(); try (HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs)) {
wb.close(); // Don't close re-used test resources!
fsExt.setCloseFilesystem(false);
fsText = fsExt.getText();
}
try (HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb)) {
// Don't close re-used test resources!
hwExt.setCloseFilesystem(false);
hwText = hwExt.getText();
}
try (HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt)) {
// Don't close re-used test resources!
eeExt.setCloseFilesystem(false);
eeText = eeExt.getText();
}
} }
assertEquals(fsText, hwText); assertEquals(fsText, hwText);

View File

@ -43,9 +43,7 @@ public final class TestExcelExtractor {
private static ExcelExtractor createExtractor(String sampleFileName) throws IOException { private static ExcelExtractor createExtractor(String sampleFileName) throws IOException {
File file = HSSFTestDataSamples.getSampleFile(sampleFileName); File file = HSSFTestDataSamples.getSampleFile(sampleFileName);
POIFSFileSystem fs = new POIFSFileSystem(file); POIFSFileSystem fs = new POIFSFileSystem(file);
ExcelExtractor extractor = new ExcelExtractor(fs); return new ExcelExtractor(fs);
extractor.setFilesystem(fs);
return extractor;
} }
@Test @Test