mirror of https://github.com/apache/poi.git
#64411 - Provide JigSaw modules
- rework extractors - see bugzilla entry for more information git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1880839 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4bf968d6bd
commit
dfdf9e6d6f
|
@ -29,11 +29,11 @@ import java.util.HashSet;
|
|||
import java.util.Set;
|
||||
|
||||
import org.apache.poi.EncryptedDocumentException;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.POIOLE2TextExtractor;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
|
||||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||
import org.apache.poi.ooxml.extractor.ExtractorFactory;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.ss.extractor.ExcelExtractor;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
|
|
|
@ -23,7 +23,7 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.ooxml.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFSlideShow;
|
||||
|
@ -53,7 +53,8 @@ public class XSLFFileHandler extends SlideShowHandler {
|
|||
|
||||
// additionally try the other getText() methods
|
||||
|
||||
try (SlideShowExtractor extractor = ExtractorFactory.createExtractor(file)) {
|
||||
//noinspection rawtypes
|
||||
try (SlideShowExtractor extractor = (SlideShowExtractor) ExtractorFactory.createExtractor(file)) {
|
||||
assertNotNull(extractor);
|
||||
extractor.setSlidesByDefault(true);
|
||||
extractor.setNotesByDefault(true);
|
||||
|
|
|
@ -0,0 +1,304 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.extractor;
|
||||
|
||||
import static org.apache.poi.hssf.record.crypto.Biff8EncryptionKey.getCurrentUserPassword;
|
||||
import static org.apache.poi.poifs.crypt.EncryptionInfo.ENCRYPTION_INFO_ENTRY;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ServiceLoader;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.poi.EmptyFileException;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.poifs.crypt.Decryptor;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.poifs.filesystem.FileMagic;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* Figures out the correct POIOLE2TextExtractor for your supplied
|
||||
* document, and returns it.
|
||||
*
|
||||
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
|
||||
* not present on the runtime classpath</p>
|
||||
* <p>Note 2 - for text extractor creation across all formats, use
|
||||
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
|
||||
* the OOXML jar.</p>
|
||||
* <p>Note 3 - rather than using this, for most cases you would be better
|
||||
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
|
||||
*/
|
||||
@SuppressWarnings({"WeakerAccess", "JavadocReference"})
|
||||
public final class ExtractorFactory {
|
||||
private static final POILogger LOGGER = POILogFactory.getLogger(ExtractorFactory.class);
|
||||
|
||||
/** Should this thread prefer event based over usermodel based extractors? */
|
||||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);
|
||||
|
||||
/** Should all threads prefer event based over usermodel based extractors? */
|
||||
private static Boolean allPreferEventExtractors;
|
||||
|
||||
|
||||
private static class Singleton {
|
||||
private static final ExtractorFactory INSTANCE = new ExtractorFactory();
|
||||
}
|
||||
|
||||
private interface ProviderMethod {
|
||||
POITextExtractor create(ExtractorProvider prov) throws IOException;
|
||||
}
|
||||
|
||||
private final List<ExtractorProvider> provider = new ArrayList<>();
|
||||
|
||||
|
||||
private ExtractorFactory() {
|
||||
ServiceLoader.load(ExtractorProvider.class).forEach(provider::add);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is false.
|
||||
*
|
||||
* @return true if event extractors should be preferred in the current thread, fals otherwise.
|
||||
*/
|
||||
public static boolean getThreadPrefersEventExtractors() {
|
||||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is to use the thread level setting, which defaults to false.
|
||||
*
|
||||
* @return true if event extractors should be preferred in all threads, fals otherwise.
|
||||
*/
|
||||
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||
return allPreferEventExtractors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* Will only be used if the All Threads setting is null.
|
||||
*
|
||||
* @param preferEventExtractors If this threads should prefer event based extractors.
|
||||
*/
|
||||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||
threadPreferEventExtractors.set(preferEventExtractors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* If set, will take preference over the Thread level setting.
|
||||
*
|
||||
* @param preferEventExtractors If all threads should prefer event based extractors.
|
||||
*/
|
||||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||
allPreferEventExtractors = preferEventExtractors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread use event based extractors is available?
|
||||
* Checks the all-threads one first, then thread specific.
|
||||
*
|
||||
* @return If the current thread should use event based extractors.
|
||||
*/
|
||||
public static boolean getPreferEventExtractor() {
|
||||
return (allPreferEventExtractors != null) ? allPreferEventExtractors : threadPreferEventExtractors.get();
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(POIFSFileSystem fs) throws IOException {
|
||||
return createExtractor(fs, getCurrentUserPassword());
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(POIFSFileSystem fs, String password) throws IOException {
|
||||
return createExtractor(fs.getRoot(), password);
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(InputStream input) throws IOException {
|
||||
return createExtractor(input, getCurrentUserPassword());
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(InputStream input, String password) throws IOException {
|
||||
final InputStream is = FileMagic.prepareToCheckMagic(input);
|
||||
byte[] emptyFileCheck = new byte[1];
|
||||
is.mark(emptyFileCheck.length);
|
||||
if (is.read(emptyFileCheck) < emptyFileCheck.length) {
|
||||
throw new EmptyFileException();
|
||||
}
|
||||
is.reset();
|
||||
|
||||
final FileMagic fm = FileMagic.valueOf(is);
|
||||
if (FileMagic.OOXML == fm) {
|
||||
return wp(fm, w -> w.create(is, password));
|
||||
}
|
||||
|
||||
if (FileMagic.OLE2 != fm) {
|
||||
throw new IOException("Can't create extractor - unsupported file type: "+fm);
|
||||
}
|
||||
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(is);
|
||||
boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
|
||||
|
||||
return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(File file) throws IOException {
|
||||
return createExtractor(file, getCurrentUserPassword());
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(File file, String password) throws IOException {
|
||||
if (file.length() == 0) {
|
||||
throw new EmptyFileException();
|
||||
}
|
||||
|
||||
final FileMagic fm = FileMagic.valueOf(file);
|
||||
if (FileMagic.OOXML == fm) {
|
||||
return wp(fm, w -> w.create(file, password));
|
||||
}
|
||||
|
||||
if (FileMagic.OLE2 != fm) {
|
||||
throw new IOException("Can't create extractor - unsupported file type: "+fm);
|
||||
}
|
||||
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(file, true);
|
||||
try {
|
||||
boolean isOOXML = poifs.getRoot().hasEntry(ENCRYPTION_INFO_ENTRY);
|
||||
return wp(isOOXML ? FileMagic.OOXML : fm, w -> w.create(poifs.getRoot(), password));
|
||||
} catch (IOException | RuntimeException e) {
|
||||
IOUtils.closeQuietly(poifs);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create the Extractor, if possible. Generally needs the Scratchpad jar.
|
||||
* Note that this won't check for embedded OOXML resources either, use
|
||||
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
|
||||
*
|
||||
* @param root The {@link DirectoryNode} pointing to a document.
|
||||
*
|
||||
* @return The resulting {@link POITextExtractor}, an exception is thrown if
|
||||
* no TextExtractor can be created for some reason.
|
||||
*
|
||||
* @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
|
||||
* @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
|
||||
* an unsupported version of Excel.
|
||||
* @throws IllegalArgumentException If creating the Extractor fails
|
||||
*/
|
||||
public static POITextExtractor createExtractor(DirectoryNode root) throws IOException {
|
||||
return createExtractor(root, getCurrentUserPassword());
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(final DirectoryNode root, String password) throws IOException {
|
||||
// Encrypted OOXML files go inside OLE2 containers, is this one?
|
||||
if (root.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY) || root.hasEntry("Package")) {
|
||||
return wp(FileMagic.OOXML, w -> w.create(root, password));
|
||||
} else {
|
||||
return wp(FileMagic.OLE2, w -> w.create(root, password));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of text extractors, one for each of
|
||||
* the embedded documents in the file (if there are any).
|
||||
* If there are no embedded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*
|
||||
* @param ext The extractor to look at for embedded documents
|
||||
*
|
||||
* @return An array of resulting extractors. Empty if no embedded documents are found.
|
||||
*
|
||||
* @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
|
||||
* @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
|
||||
* an unsupported version of Excel.
|
||||
* @throws IllegalArgumentException If creating the Extractor fails
|
||||
*/
|
||||
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
|
||||
if (ext == null) {
|
||||
throw new IllegalStateException("extractor must be given");
|
||||
}
|
||||
|
||||
// All the embedded directories we spotted
|
||||
List<Entry> dirs = new ArrayList<>();
|
||||
// For anything else not directly held in as a POIFS directory
|
||||
List<InputStream> nonPOIFS = new ArrayList<>();
|
||||
|
||||
// Find all the embedded directories
|
||||
DirectoryEntry root = ext.getRoot();
|
||||
if(root == null) {
|
||||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
||||
}
|
||||
|
||||
if(ext instanceof ExcelExtractor) {
|
||||
// These are in MBD... under the root
|
||||
StreamSupport.stream(root.spliterator(), false)
|
||||
.filter(entry -> entry.getName().startsWith("MBD"))
|
||||
.forEach(dirs::add);
|
||||
} else {
|
||||
for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
|
||||
if (prov.accepts(FileMagic.OLE2)) {
|
||||
prov.identifyEmbeddedResources(ext, dirs, nonPOIFS);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create the extractors
|
||||
if(dirs.size() == 0 && nonPOIFS.size() == 0){
|
||||
return new POITextExtractor[0];
|
||||
}
|
||||
|
||||
ArrayList<POITextExtractor> textExtractors = new ArrayList<>();
|
||||
for (Entry dir : dirs) {
|
||||
textExtractors.add(createExtractor((DirectoryNode) dir));
|
||||
}
|
||||
for (InputStream stream : nonPOIFS) {
|
||||
try {
|
||||
textExtractors.add(createExtractor(stream));
|
||||
} catch (IOException e) {
|
||||
// Ignore, just means it didn't contain a format we support as yet
|
||||
LOGGER.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
|
||||
}
|
||||
}
|
||||
return textExtractors.toArray(new POITextExtractor[0]);
|
||||
}
|
||||
|
||||
private static POITextExtractor wp(FileMagic fm, ProviderMethod fun) throws IOException {
|
||||
for (ExtractorProvider prov : Singleton.INSTANCE.provider) {
|
||||
if (prov.accepts(fm)) {
|
||||
POITextExtractor ext = fun.create(prov);
|
||||
if (ext != null) {
|
||||
return ext;
|
||||
}
|
||||
}
|
||||
}
|
||||
throw new IOException("Your InputStream was neither an OLE2 stream, nor an OOXML stream " +
|
||||
"or you haven't provide the poi-ooxml*.jar and/or poi-scratchpad*.jar in the classpath/modulepath - FileMagic: "+fm);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.poifs.filesystem.FileMagic;
|
||||
|
||||
public interface ExtractorProvider {
|
||||
boolean accepts(FileMagic fm);
|
||||
|
||||
/**
|
||||
* Create Extractor via file
|
||||
* @param file the file
|
||||
* @param password the password or {@code null} if not encrypted
|
||||
* @return the extractor
|
||||
* @throws IOException if file can't be read or parsed
|
||||
*/
|
||||
POITextExtractor create(File file, String password) throws IOException;
|
||||
|
||||
/**
|
||||
* Create Extractor via InputStream
|
||||
* @param inputStream the stream
|
||||
* @param password the password or {@code null} if not encrypted
|
||||
* @return the extractor
|
||||
* @throws IOException if stream can't be read or parsed
|
||||
*/
|
||||
POITextExtractor create(InputStream inputStream, String password) throws IOException;
|
||||
|
||||
/**
|
||||
* Create Extractor from POIFS node
|
||||
* @param poifsDir the node
|
||||
* @param password the password or {@code null} if not encrypted
|
||||
* @return the extractor
|
||||
* @throws IOException if node can't be parsed
|
||||
*/
|
||||
POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException;
|
||||
|
||||
/**
|
||||
* Returns an array of text extractors, one for each of
|
||||
* the embedded documents in the file (if there are any).
|
||||
* If there are no embedded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*
|
||||
* @param ext the extractor holding the directory to start parsing
|
||||
* @param dirs a list to be filled with directory references holding embedded
|
||||
* @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
|
||||
*
|
||||
* @throws IOException when the format specific extraction fails because of invalid entires
|
||||
*/
|
||||
default void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
|
||||
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.extractor;
|
||||
|
||||
import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hssf.extractor.OldExcelExtractor;
|
||||
import org.apache.poi.hssf.model.InternalWorkbook;
|
||||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.FileMagic;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* ExtractorFactory for HSSF and Old Excel format
|
||||
*/
|
||||
public class MainExtractorFactory implements ExtractorProvider {
|
||||
@Override
|
||||
public boolean accepts(FileMagic fm) {
|
||||
return FileMagic.OLE2 == fm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public POITextExtractor create(File file, String password) throws IOException {
|
||||
return create(new POIFSFileSystem(file, true).getRoot(), password);
|
||||
}
|
||||
|
||||
@Override
|
||||
public POITextExtractor create(InputStream inputStream, String password) throws IOException {
|
||||
return create(new POIFSFileSystem(inputStream).getRoot(), password);
|
||||
}
|
||||
|
||||
@Override
|
||||
public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
|
||||
final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
|
||||
try {
|
||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||
|
||||
// Look for certain entries in the stream, to figure it out from
|
||||
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
|
||||
if (poifsDir.hasEntry(workbookName)) {
|
||||
return ExtractorFactory.getPreferEventExtractor() ? new EventBasedExcelExtractor(poifsDir) : new ExcelExtractor(poifsDir);
|
||||
}
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry(InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME)) {
|
||||
return new OldExcelExtractor(poifsDir);
|
||||
}
|
||||
} finally {
|
||||
Biff8EncryptionKey.setCurrentUserPassword(oldPW);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -1,279 +0,0 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.extractor;
|
||||
|
||||
import static org.apache.poi.hssf.model.InternalWorkbook.OLD_WORKBOOK_DIR_ENTRY_NAME;
|
||||
import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.hssf.OldExcelFormatException;
|
||||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* Figures out the correct POIOLE2TextExtractor for your supplied
|
||||
* document, and returns it.
|
||||
*
|
||||
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
|
||||
* not present on the runtime classpath</p>
|
||||
* <p>Note 2 - for text extractor creation across all formats, use
|
||||
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} contained within
|
||||
* the OOXML jar.</p>
|
||||
* <p>Note 3 - rather than using this, for most cases you would be better
|
||||
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
|
||||
*/
|
||||
@SuppressWarnings({"WeakerAccess", "JavadocReference"})
|
||||
public final class OLE2ExtractorFactory {
|
||||
private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
|
||||
|
||||
/** Should this thread prefer event based over usermodel based extractors? */
|
||||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = ThreadLocal.withInitial(() -> Boolean.FALSE);
|
||||
|
||||
/** Should all threads prefer event based over usermodel based extractors? */
|
||||
private static Boolean allPreferEventExtractors;
|
||||
|
||||
private OLE2ExtractorFactory() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is false.
|
||||
*
|
||||
* @return true if event extractors should be preferred in the current thread, fals otherwise.
|
||||
*/
|
||||
public static boolean getThreadPrefersEventExtractors() {
|
||||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is to use the thread level setting, which defaults to false.
|
||||
*
|
||||
* @return true if event extractors should be preferred in all threads, fals otherwise.
|
||||
*/
|
||||
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||
return allPreferEventExtractors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* Will only be used if the All Threads setting is null.
|
||||
*
|
||||
* @param preferEventExtractors If this threads should prefer event based extractors.
|
||||
*/
|
||||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||
threadPreferEventExtractors.set(preferEventExtractors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* If set, will take preference over the Thread level setting.
|
||||
*
|
||||
* @param preferEventExtractors If all threads should prefer event based extractors.
|
||||
*/
|
||||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||
allPreferEventExtractors = preferEventExtractors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread use event based extractors is available?
|
||||
* Checks the all-threads one first, then thread specific.
|
||||
*
|
||||
* @return If the current thread should use event based extractors.
|
||||
*/
|
||||
public static boolean getPreferEventExtractor() {
|
||||
if(allPreferEventExtractors != null) {
|
||||
return allPreferEventExtractors;
|
||||
}
|
||||
return threadPreferEventExtractors.get();
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException {
|
||||
return (T)createExtractor(fs.getRoot());
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <T extends POITextExtractor> T createExtractor(InputStream input) throws IOException {
|
||||
Class<?> cls = getOOXMLClass();
|
||||
if (cls != null) {
|
||||
// Use Reflection to get us the full OOXML-enabled version
|
||||
try {
|
||||
Method m = cls.getDeclaredMethod("createExtractor", InputStream.class);
|
||||
return (T)m.invoke(null, input);
|
||||
} catch (IllegalArgumentException iae) {
|
||||
throw iae;
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("Error creating Extractor for InputStream", e);
|
||||
}
|
||||
} else {
|
||||
// Best hope it's OLE2....
|
||||
return createExtractor(new POIFSFileSystem(input));
|
||||
}
|
||||
}
|
||||
|
||||
private static Class<?> getOOXMLClass() {
|
||||
try {
|
||||
return OLE2ExtractorFactory.class.getClassLoader().loadClass(
|
||||
"org.apache.poi.extractor.ExtractorFactory"
|
||||
);
|
||||
} catch (ClassNotFoundException e) {
|
||||
LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
private static Class<?> getScratchpadClass() {
|
||||
try {
|
||||
return OLE2ExtractorFactory.class.getClassLoader().loadClass(
|
||||
"org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory"
|
||||
);
|
||||
} catch (ClassNotFoundException e) {
|
||||
LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
|
||||
throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the Extractor, if possible. Generally needs the Scratchpad jar.
|
||||
* Note that this won't check for embedded OOXML resources either, use
|
||||
* {@link org.apache.poi.ooxml.extractor.ExtractorFactory} for that.
|
||||
*
|
||||
* @param poifsDir The {@link DirectoryNode} pointing to a document.
|
||||
*
|
||||
* @return The resulting {@link POITextExtractor}, an exception is thrown if
|
||||
* no TextExtractor can be created for some reason.
|
||||
*
|
||||
* @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
|
||||
* @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
|
||||
* an unsupported version of Excel.
|
||||
* @throws IllegalArgumentException If creating the Extractor fails
|
||||
*/
|
||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
|
||||
// Look for certain entries in the stream, to figure it
|
||||
// out from
|
||||
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
|
||||
if (poifsDir.hasEntry(workbookName)) {
|
||||
if (getPreferEventExtractor()) {
|
||||
return new EventBasedExcelExtractor(poifsDir);
|
||||
}
|
||||
return new ExcelExtractor(poifsDir);
|
||||
}
|
||||
}
|
||||
if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
|
||||
throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
|
||||
+ "found. Please call OldExcelExtractor directly for basic text extraction");
|
||||
}
|
||||
|
||||
// Ask Scratchpad, or fail trying
|
||||
Class<?> cls = getScratchpadClass();
|
||||
try {
|
||||
Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
|
||||
POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
|
||||
if (ext != null) return ext;
|
||||
} catch (IllegalArgumentException iae) {
|
||||
throw iae;
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of text extractors, one for each of
|
||||
* the embedded documents in the file (if there are any).
|
||||
* If there are no embedded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*
|
||||
* @param ext The extractor to look at for embedded documents
|
||||
*
|
||||
* @return An array of resulting extractors. Empty if no embedded documents are found.
|
||||
*
|
||||
* @throws IOException If converting the {@link DirectoryNode} into a HSSFWorkbook fails
|
||||
* @throws OldFileFormatException If the {@link DirectoryNode} points to a format of
|
||||
* an unsupported version of Excel.
|
||||
* @throws IllegalArgumentException If creating the Extractor fails
|
||||
*/
|
||||
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
|
||||
// All the embedded directories we spotted
|
||||
List<Entry> dirs = new ArrayList<>();
|
||||
// For anything else not directly held in as a POIFS directory
|
||||
List<InputStream> nonPOIFS = new ArrayList<>();
|
||||
|
||||
// Find all the embedded directories
|
||||
DirectoryEntry root = ext.getRoot();
|
||||
if(root == null) {
|
||||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
||||
}
|
||||
|
||||
if(ext instanceof ExcelExtractor) {
|
||||
// These are in MBD... under the root
|
||||
Iterator<Entry> it = root.getEntries();
|
||||
while(it.hasNext()) {
|
||||
Entry entry = it.next();
|
||||
if(entry.getName().startsWith("MBD")) {
|
||||
dirs.add(entry);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Ask Scratchpad, or fail trying
|
||||
Class<?> cls = getScratchpadClass();
|
||||
try {
|
||||
Method m = cls.getDeclaredMethod(
|
||||
"identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
|
||||
m.invoke(null, ext, dirs, nonPOIFS);
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Create the extractors
|
||||
if(dirs.size() == 0 && nonPOIFS.size() == 0){
|
||||
return new POITextExtractor[0];
|
||||
}
|
||||
|
||||
ArrayList<POITextExtractor> e = new ArrayList<>();
|
||||
for (Entry dir : dirs) {
|
||||
e.add(createExtractor((DirectoryNode) dir
|
||||
));
|
||||
}
|
||||
for (InputStream stream : nonPOIFS) {
|
||||
try {
|
||||
e.add(createExtractor(stream));
|
||||
} catch (Exception xe) {
|
||||
// Ignore, invalid format
|
||||
LOGGER.log(POILogger.WARN, xe);
|
||||
}
|
||||
}
|
||||
return e.toArray(new POITextExtractor[0]);
|
||||
}
|
||||
}
|
|
@ -30,55 +30,28 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
|||
* org.apache.poi.[format].extractor .
|
||||
*
|
||||
* @see org.apache.poi.hssf.extractor.ExcelExtractor
|
||||
* @see org.apache.poi.hslf.extractor.PowerPointExtractor
|
||||
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
|
||||
* @see org.apache.poi.hwpf.extractor.WordExtractor
|
||||
*/
|
||||
public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
||||
/** The POIDocument that's open */
|
||||
protected POIDocument document;
|
||||
|
||||
/**
|
||||
* Creates a new text extractor for the given document
|
||||
*
|
||||
* @param document The POIDocument to use in this extractor.
|
||||
*/
|
||||
public POIOLE2TextExtractor(POIDocument document) {
|
||||
this.document = document;
|
||||
|
||||
// Ensure any underlying resources, such as open files,
|
||||
// will get cleaned up if the user calls #close()
|
||||
setFilesystem(document);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new text extractor, using the same
|
||||
* document as another text extractor. Normally
|
||||
* only used by properties extractors.
|
||||
*
|
||||
* @param otherExtractor the extractor whose document is to be used
|
||||
*/
|
||||
protected POIOLE2TextExtractor(POIOLE2TextExtractor otherExtractor) {
|
||||
this.document = otherExtractor.document;
|
||||
}
|
||||
|
||||
public interface POIOLE2TextExtractor extends POITextExtractor {
|
||||
/**
|
||||
* Returns the document information metadata for the document
|
||||
*
|
||||
* @return The Document Summary Information or null
|
||||
* if it could not be read for this document.
|
||||
*/
|
||||
public DocumentSummaryInformation getDocSummaryInformation() {
|
||||
return document.getDocumentSummaryInformation();
|
||||
default DocumentSummaryInformation getDocSummaryInformation() {
|
||||
return getDocument().getDocumentSummaryInformation();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the summary information metadata for the document.
|
||||
*
|
||||
* @return The Summary information for the document or null
|
||||
* if it could not be read for this document.
|
||||
*/
|
||||
public SummaryInformation getSummaryInformation() {
|
||||
return document.getSummaryInformation();
|
||||
default SummaryInformation getSummaryInformation() {
|
||||
return getDocument().getSummaryInformation();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -88,7 +61,7 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
|||
* @return an instance of POITextExtractor that can extract meta-data.
|
||||
*/
|
||||
@Override
|
||||
public POITextExtractor getMetadataTextExtractor() {
|
||||
default POITextExtractor getMetadataTextExtractor() {
|
||||
return new HPSFPropertiesExtractor(this);
|
||||
}
|
||||
|
||||
|
@ -97,8 +70,8 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
|||
*
|
||||
* @return the DirectoryEntry that is associated with the POIDocument of this extractor.
|
||||
*/
|
||||
public DirectoryEntry getRoot() {
|
||||
return document.getDirectory();
|
||||
default DirectoryEntry getRoot() {
|
||||
return getDocument().getDirectory();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -107,7 +80,5 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
|
|||
* @return the underlying POIDocument
|
||||
*/
|
||||
@Override
|
||||
public POIDocument getDocument() {
|
||||
return document;
|
||||
}
|
||||
POIDocument getDocument();
|
||||
}
|
|
@ -27,13 +27,10 @@ import java.io.IOException;
|
|||
* org.apache.poi.[format].extractor .
|
||||
*
|
||||
* @see org.apache.poi.hssf.extractor.ExcelExtractor
|
||||
* @see org.apache.poi.hslf.extractor.PowerPointExtractor
|
||||
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
|
||||
* @see org.apache.poi.hwpf.extractor.WordExtractor
|
||||
*/
|
||||
public abstract class POITextExtractor implements Closeable {
|
||||
private Closeable fsToClose;
|
||||
|
||||
public interface POITextExtractor extends Closeable {
|
||||
/**
|
||||
* Retrieves all the text from the document.
|
||||
* How cells, paragraphs etc are separated in the text
|
||||
|
@ -41,7 +38,7 @@ public abstract class POITextExtractor implements Closeable {
|
|||
* a specific project for details.
|
||||
* @return All the text from the document
|
||||
*/
|
||||
public abstract String getText();
|
||||
String getText();
|
||||
|
||||
/**
|
||||
* Returns another text extractor, which is able to
|
||||
|
@ -50,16 +47,23 @@ public abstract class POITextExtractor implements Closeable {
|
|||
*
|
||||
* @return the metadata and text extractor
|
||||
*/
|
||||
public abstract POITextExtractor getMetadataTextExtractor();
|
||||
POITextExtractor getMetadataTextExtractor();
|
||||
|
||||
/**
|
||||
* Used to ensure file handle cleanup.
|
||||
*
|
||||
* @param fs filesystem to close
|
||||
* @param doCloseFilesystem {@code true} (default), if underlying resources/filesystem should be
|
||||
* closed on {@link #close()}
|
||||
*/
|
||||
public void setFilesystem(Closeable fs) {
|
||||
fsToClose = fs;
|
||||
}
|
||||
void setCloseFilesystem(boolean doCloseFilesystem);
|
||||
|
||||
/**
|
||||
* @return {@code true}, if resources/filesystem should be closed on {@link #close()}
|
||||
*/
|
||||
boolean isCloseFilesystem();
|
||||
|
||||
/**
|
||||
* @return The underlying resources/filesystem
|
||||
*/
|
||||
Closeable getFilesystem();
|
||||
|
||||
/**
|
||||
* Allows to free resources of the Extractor as soon as
|
||||
|
@ -69,14 +73,15 @@ public abstract class POITextExtractor implements Closeable {
|
|||
* The Extractor cannot be used after close has been called.
|
||||
*/
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if(fsToClose != null) {
|
||||
fsToClose.close();
|
||||
default void close() throws IOException {
|
||||
Closeable fs = getFilesystem();
|
||||
if (isCloseFilesystem() && fs != null) {
|
||||
fs.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the processed document
|
||||
*/
|
||||
public abstract Object getDocument();
|
||||
Object getDocument();
|
||||
}
|
||||
|
|
|
@ -17,9 +17,6 @@
|
|||
|
||||
package org.apache.poi.hpsf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.extractor.POIOLE2TextExtractor;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
|
@ -37,15 +34,20 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
* built-in and custom, returning them in
|
||||
* textual form.
|
||||
*/
|
||||
public class HPSFPropertiesExtractor extends POIOLE2TextExtractor {
|
||||
public class HPSFPropertiesExtractor implements POIOLE2TextExtractor {
|
||||
private final POIDocument document;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
public HPSFPropertiesExtractor(POIOLE2TextExtractor mainExtractor) {
|
||||
super(mainExtractor);
|
||||
document = mainExtractor.getDocument();
|
||||
}
|
||||
public HPSFPropertiesExtractor(POIDocument doc) {
|
||||
super(doc);
|
||||
|
||||
public HPSFPropertiesExtractor(POIDocument document) {
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
public HPSFPropertiesExtractor(POIFSFileSystem fs) {
|
||||
super(new HPSFPropertiesOnlyDocument(fs));
|
||||
document = new HPSFPropertiesOnlyDocument(fs);
|
||||
}
|
||||
|
||||
public String getDocumentSummaryInformationText() {
|
||||
|
@ -137,12 +139,23 @@ public class HPSFPropertiesExtractor extends POIOLE2TextExtractor {
|
|||
return super.hashCode();
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
for (String file : args) {
|
||||
try (HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(
|
||||
new POIFSFileSystem(new File(file)))) {
|
||||
System.out.println(ext.getText());
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public POIDocument getDocument() {
|
||||
return document;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public POIDocument getFilesystem() {
|
||||
return document;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
package org.apache.poi.hssf.extractor;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -37,9 +38,9 @@ import org.apache.poi.hssf.record.LabelRecord;
|
|||
import org.apache.poi.hssf.record.LabelSSTRecord;
|
||||
import org.apache.poi.hssf.record.NoteRecord;
|
||||
import org.apache.poi.hssf.record.NumberRecord;
|
||||
import org.apache.poi.hssf.record.Record;
|
||||
import org.apache.poi.hssf.record.SSTRecord;
|
||||
import org.apache.poi.hssf.record.StringRecord;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
|
@ -59,26 +60,28 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
*
|
||||
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
|
||||
*/
|
||||
public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
private DirectoryNode _dir;
|
||||
public class EventBasedExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
private final POIFSFileSystem poifs;
|
||||
private final DirectoryNode _dir;
|
||||
private boolean doCloseFilesystem = true;
|
||||
boolean _includeSheetNames = true;
|
||||
boolean _formulasNotResults;
|
||||
|
||||
public EventBasedExcelExtractor( DirectoryNode dir )
|
||||
{
|
||||
super( (POIDocument)null );
|
||||
public EventBasedExcelExtractor(DirectoryNode dir) {
|
||||
poifs = null;
|
||||
_dir = dir;
|
||||
}
|
||||
|
||||
public EventBasedExcelExtractor(POIFSFileSystem fs) {
|
||||
this(fs.getRoot());
|
||||
super.setFilesystem(fs);
|
||||
poifs = fs;
|
||||
_dir = fs.getRoot();
|
||||
}
|
||||
|
||||
/**
|
||||
* Would return the document information metadata for the document,
|
||||
* if we supported it
|
||||
*/
|
||||
@Override
|
||||
public DocumentSummaryInformation getDocSummaryInformation() {
|
||||
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
|
||||
}
|
||||
|
@ -86,6 +89,7 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or
|
|||
* Would return the summary information metadata for the document,
|
||||
* if we supported it
|
||||
*/
|
||||
@Override
|
||||
public SummaryInformation getSummaryInformation() {
|
||||
throw new IllegalStateException("Metadata extraction not supported in streaming mode, please use ExcelExtractor");
|
||||
}
|
||||
|
@ -262,4 +266,29 @@ public class EventBasedExcelExtractor extends POIOLE2TextExtractor implements or
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Closeable getFilesystem() {
|
||||
return poifs;
|
||||
}
|
||||
|
||||
@Override
|
||||
public POIDocument getDocument() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DirectoryEntry getRoot() {
|
||||
return _dir;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,9 +53,10 @@ import org.apache.poi.ss.usermodel.Row.MissingCellPolicy;
|
|||
*
|
||||
* @see <a href="http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/hssf/eventusermodel/examples/XLS2CSVmra.java">XLS2CSVmra</a>
|
||||
*/
|
||||
public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
public class ExcelExtractor implements POIOLE2TextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
private final HSSFWorkbook _wb;
|
||||
private final HSSFDataFormatter _formatter;
|
||||
private boolean doCloseFilesystem = true;
|
||||
private boolean _includeSheetNames = true;
|
||||
private boolean _shouldEvaluateFormulas = true;
|
||||
private boolean _includeCellComments;
|
||||
|
@ -63,13 +64,14 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
private boolean _includeHeadersFooters = true;
|
||||
|
||||
public ExcelExtractor(HSSFWorkbook wb) {
|
||||
super(wb);
|
||||
_wb = wb;
|
||||
_formatter = new HSSFDataFormatter();
|
||||
}
|
||||
|
||||
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(fs.getRoot());
|
||||
}
|
||||
|
||||
public ExcelExtractor(DirectoryNode dir) throws IOException {
|
||||
this(new HSSFWorkbook(dir, true));
|
||||
}
|
||||
|
@ -225,7 +227,7 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
|
||||
try (InputStream is = cmdArgs.getInputFile() == null ? System.in : new FileInputStream(cmdArgs.getInputFile());
|
||||
HSSFWorkbook wb = new HSSFWorkbook(is);
|
||||
ExcelExtractor extractor = new ExcelExtractor(wb);
|
||||
ExcelExtractor extractor = new ExcelExtractor(wb)
|
||||
) {
|
||||
extractor.setIncludeSheetNames(cmdArgs.shouldShowSheetNames());
|
||||
extractor.setFormulasNotResults(!cmdArgs.shouldEvaluateFormulas());
|
||||
|
@ -411,4 +413,24 @@ public class ExcelExtractor extends POIOLE2TextExtractor implements org.apache.p
|
|||
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HSSFWorkbook getDocument() {
|
||||
return _wb;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HSSFWorkbook getFilesystem() {
|
||||
return _wb;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,6 +29,7 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.EncryptedDocumentException;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.hssf.OldExcelFormatException;
|
||||
import org.apache.poi.hssf.record.BOFRecord;
|
||||
import org.apache.poi.hssf.record.CodepageRecord;
|
||||
|
@ -58,7 +59,7 @@ import org.apache.poi.util.IOUtils;
|
|||
* by Apache Tika, but not really intended for display to the user.
|
||||
* </p>
|
||||
*/
|
||||
public class OldExcelExtractor implements Closeable {
|
||||
public class OldExcelExtractor implements POITextExtractor {
|
||||
|
||||
private final static int FILE_PASS_RECORD_SID = 0x2f;
|
||||
//arbitrarily selected; may need to increase
|
||||
|
@ -295,24 +296,39 @@ public class OldExcelExtractor implements Closeable {
|
|||
}
|
||||
}
|
||||
|
||||
close();
|
||||
ris = null;
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
// some cases require this close here
|
||||
if(toClose != null) {
|
||||
IOUtils.closeQuietly(toClose);
|
||||
toClose = null;
|
||||
}
|
||||
}
|
||||
|
||||
protected void handleNumericCell(StringBuilder text, double value) {
|
||||
// TODO Need to fetch / use format strings
|
||||
text.append(value);
|
||||
text.append('\n');
|
||||
}
|
||||
|
||||
@Override
|
||||
public POITextExtractor getMetadataTextExtractor() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return toClose != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Closeable getFilesystem() {
|
||||
return toClose;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getDocument() {
|
||||
return ris;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -54,14 +54,14 @@ import org.apache.poi.util.POILogger;
|
|||
public class SlideShowExtractor<
|
||||
S extends Shape<S,P>,
|
||||
P extends TextParagraph<S,P,? extends TextRun>
|
||||
> extends POITextExtractor {
|
||||
> implements POITextExtractor {
|
||||
private static final POILogger LOG = POILogFactory.getLogger(SlideShowExtractor.class);
|
||||
|
||||
// placeholder text for slide numbers
|
||||
private static final String SLIDE_NUMBER_PH = "‹#›";
|
||||
|
||||
|
||||
private SlideShow<S,P> slideshow;
|
||||
protected final SlideShow<S,P> slideshow;
|
||||
|
||||
private boolean slidesByDefault = true;
|
||||
private boolean notesByDefault;
|
||||
|
@ -69,9 +69,9 @@ public class SlideShowExtractor<
|
|||
private boolean masterByDefault;
|
||||
|
||||
private Predicate<Object> filter = o -> true;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
public SlideShowExtractor(final SlideShow<S,P> slideshow) {
|
||||
setFilesystem(slideshow);
|
||||
this.slideshow = slideshow;
|
||||
}
|
||||
|
||||
|
@ -81,8 +81,8 @@ public class SlideShowExtractor<
|
|||
* @return the opened document
|
||||
*/
|
||||
@Override
|
||||
public final Object getDocument() {
|
||||
return slideshow.getPersistDocument();
|
||||
public SlideShow<S,P> getDocument() {
|
||||
return slideshow;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -339,17 +339,17 @@ public class SlideShowExtractor<
|
|||
return raw;
|
||||
}
|
||||
|
||||
TextParagraph tp = tr.getParagraph();
|
||||
TextShape ps = (tp != null) ? tp.getParentShape() : null;
|
||||
Sheet sh = (ps != null) ? ps.getSheet() : null;
|
||||
String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide)sh).getSlideNumber() + 1) : "";
|
||||
TextParagraph<?,?,?> tp = tr.getParagraph();
|
||||
TextShape<?,?> ps = (tp != null) ? tp.getParentShape() : null;
|
||||
Sheet<?,?> sh = (ps != null) ? ps.getSheet() : null;
|
||||
String slideNr = (sh instanceof Slide) ? Integer.toString(((Slide<?,?>)sh).getSlideNumber() + 1) : "";
|
||||
|
||||
return raw.replace(SLIDE_NUMBER_PH, slideNr);
|
||||
}
|
||||
|
||||
private static String replaceTextCap(TextRun tr) {
|
||||
final TextParagraph tp = tr.getParagraph();
|
||||
final TextShape sh = (tp != null) ? tp.getParentShape() : null;
|
||||
final TextParagraph<?,?,?> tp = tr.getParagraph();
|
||||
final TextShape<?,?> sh = (tp != null) ? tp.getParentShape() : null;
|
||||
final Placeholder ph = (sh != null) ? sh.getPlaceholder() : null;
|
||||
|
||||
// 0xB acts like carriage return in page titles and like blank in the others
|
||||
|
@ -438,4 +438,19 @@ public class SlideShowExtractor<
|
|||
(italic == null || tr.isItalic() == italic) &&
|
||||
(bold == null || tr.isBold() == bold);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SlideShow<S,P> getFilesystem() {
|
||||
return getDocument();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ public interface ExcelExtractor {
|
|||
*
|
||||
* @param includeSheetNames {@code true} if the sheet names should be included
|
||||
*/
|
||||
public void setIncludeSheetNames(boolean includeSheetNames);
|
||||
void setIncludeSheetNames(boolean includeSheetNames);
|
||||
|
||||
/**
|
||||
* Should we return the formula itself, and not the result it produces?
|
||||
|
@ -35,7 +35,7 @@ public interface ExcelExtractor {
|
|||
*
|
||||
* @param formulasNotResults {@code true} if the formula itself is returned
|
||||
*/
|
||||
public void setFormulasNotResults(boolean formulasNotResults);
|
||||
void setFormulasNotResults(boolean formulasNotResults);
|
||||
|
||||
/**
|
||||
* Should headers and footers be included in the output?
|
||||
|
@ -43,7 +43,7 @@ public interface ExcelExtractor {
|
|||
*
|
||||
* @param includeHeadersFooters {@code true} if headers and footers should be included
|
||||
*/
|
||||
public void setIncludeHeadersFooters(boolean includeHeadersFooters);
|
||||
void setIncludeHeadersFooters(boolean includeHeadersFooters);
|
||||
|
||||
/**
|
||||
* Should cell comments be included?
|
||||
|
@ -51,12 +51,12 @@ public interface ExcelExtractor {
|
|||
*
|
||||
* @param includeCellComments {@code true} if cell comments should be included
|
||||
*/
|
||||
public void setIncludeCellComments(boolean includeCellComments);
|
||||
void setIncludeCellComments(boolean includeCellComments);
|
||||
|
||||
/**
|
||||
* Retrieves the text contents of the file
|
||||
*
|
||||
* @return the text contents of the file
|
||||
*/
|
||||
public String getText();
|
||||
String getText();
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -29,6 +29,7 @@ module org.apache.poi.ooxml {
|
|||
requires java.security.jgss;
|
||||
|
||||
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
|
||||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
|
||||
|
||||
exports org.apache.poi.xwpf.extractor;
|
||||
exports org.apache.poi.xwpf.usermodel;
|
||||
|
|
Binary file not shown.
|
@ -29,6 +29,7 @@ module org.apache.poi.ooxml {
|
|||
requires java.security.jgss;
|
||||
|
||||
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.xssf.usermodel.XSSFWorkbookFactory;
|
||||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
|
||||
|
||||
exports org.apache.poi.xwpf.extractor;
|
||||
exports org.apache.poi.xwpf.usermodel;
|
||||
|
|
Binary file not shown.
|
@ -28,8 +28,12 @@ module org.apache.poi.poi {
|
|||
requires jdk.unsupported;
|
||||
|
||||
uses org.apache.poi.ss.usermodel.WorkbookProvider;
|
||||
uses org.apache.poi.extractor.ExtractorProvider;
|
||||
|
||||
|
||||
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
|
||||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory;
|
||||
|
||||
|
||||
exports org.apache.poi;
|
||||
exports org.apache.poi.common;
|
||||
|
|
Binary file not shown.
|
@ -28,8 +28,10 @@ module org.apache.poi.poi {
|
|||
requires jdk.unsupported;
|
||||
|
||||
uses org.apache.poi.ss.usermodel.WorkbookProvider;
|
||||
uses org.apache.poi.extractor.ExtractorProvider;
|
||||
|
||||
provides org.apache.poi.ss.usermodel.WorkbookProvider with org.apache.poi.hssf.usermodel.HSSFWorkbookFactory;
|
||||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.MainExtractorFactory;
|
||||
|
||||
exports org.apache.poi;
|
||||
exports org.apache.poi.common;
|
||||
|
|
Binary file not shown.
|
@ -20,6 +20,8 @@ module org.apache.poi.scratchpad {
|
|||
requires java.desktop;
|
||||
requires commons.math3;
|
||||
|
||||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory;
|
||||
|
||||
exports org.apache.poi.hmef;
|
||||
exports org.apache.poi.hmef.dev;
|
||||
exports org.apache.poi.hmef.extractor;
|
||||
|
|
Binary file not shown.
|
@ -20,6 +20,8 @@ module org.apache.poi.scratchpad {
|
|||
requires java.desktop;
|
||||
requires commons.math3;
|
||||
|
||||
provides org.apache.poi.extractor.ExtractorProvider with org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory;
|
||||
|
||||
exports org.apache.poi.hmef;
|
||||
exports org.apache.poi.hmef.dev;
|
||||
exports org.apache.poi.hmef.extractor;
|
||||
|
|
|
@ -18,15 +18,19 @@ package org.apache.poi.ooxml.extractor;
|
|||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
|
||||
/**
|
||||
* A command line wrapper around {@link ExtractorFactory}, useful
|
||||
* for when debugging.
|
||||
*/
|
||||
public class CommandLineTextExtractor {
|
||||
public final class CommandLineTextExtractor {
|
||||
public static final String DIVIDER = "=======================";
|
||||
|
||||
private CommandLineTextExtractor() {
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
|
|
|
@ -1,384 +0,0 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.ooxml.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.EncryptedDocumentException;
|
||||
import org.apache.poi.extractor.OLE2ExtractorFactory;
|
||||
import org.apache.poi.extractor.POIOLE2TextExtractor;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||
import org.apache.poi.openxml4j.opc.PackagePart;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
|
||||
import org.apache.poi.poifs.crypt.Decryptor;
|
||||
import org.apache.poi.poifs.crypt.EncryptionInfo;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.poifs.filesystem.FileMagic;
|
||||
import org.apache.poi.poifs.filesystem.NotOLE2FileException;
|
||||
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.util.IOUtils;
|
||||
import org.apache.poi.util.NotImplemented;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
||||
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
||||
import org.apache.poi.xssf.usermodel.XSSFRelation;
|
||||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFRelation;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
|
||||
/**
|
||||
* Figures out the correct POITextExtractor for your supplied
|
||||
* document, and returns it.
|
||||
*
|
||||
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
|
||||
* not present on the runtime classpath</p>
|
||||
* <p>Note 2 - rather than using this, for most cases you would be better
|
||||
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public final class ExtractorFactory {
|
||||
private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class);
|
||||
|
||||
public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
|
||||
private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
|
||||
private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
|
||||
|
||||
private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{
|
||||
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
|
||||
XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
|
||||
XSLFRelation.PRESENTATION_MACRO
|
||||
};
|
||||
|
||||
private ExtractorFactory() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is false.
|
||||
*/
|
||||
public static boolean getThreadPrefersEventExtractors() {
|
||||
return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is to use the thread level setting, which defaults to false.
|
||||
*/
|
||||
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||
return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* Will only be used if the All Threads setting is null.
|
||||
*/
|
||||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||
OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* If set, will take preference over the Thread level setting.
|
||||
*/
|
||||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||
OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread use event based extractors is available?
|
||||
* Checks the all-threads one first, then thread specific.
|
||||
*/
|
||||
public static boolean getPreferEventExtractor() {
|
||||
return OLE2ExtractorFactory.getPreferEventExtractor();
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <T extends POITextExtractor> T createExtractor(File f) throws IOException, OpenXML4JException, XmlException {
|
||||
POIFSFileSystem fs = null;
|
||||
try {
|
||||
fs = new POIFSFileSystem(f);
|
||||
if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
|
||||
return (T)createEncryptedOOXMLExtractor(fs);
|
||||
}
|
||||
POITextExtractor extractor = createExtractor(fs);
|
||||
extractor.setFilesystem(fs);
|
||||
return (T)extractor;
|
||||
} catch (OfficeXmlFileException e) {
|
||||
// ensure file-handle release
|
||||
IOUtils.closeQuietly(fs);
|
||||
OPCPackage pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
|
||||
T t = (T)createExtractor(pkg);
|
||||
t.setFilesystem(pkg);
|
||||
return t;
|
||||
} catch (NotOLE2FileException ne) {
|
||||
// ensure file-handle release
|
||||
IOUtils.closeQuietly(fs);
|
||||
throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file", ne);
|
||||
} catch (OpenXML4JException | Error | RuntimeException | IOException | XmlException e) { // NOSONAR
|
||||
// ensure file-handle release
|
||||
IOUtils.closeQuietly(fs);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException {
|
||||
InputStream is = FileMagic.prepareToCheckMagic(inp);
|
||||
|
||||
FileMagic fm = FileMagic.valueOf(is);
|
||||
|
||||
switch (fm) {
|
||||
case OLE2:
|
||||
POIFSFileSystem fs = new POIFSFileSystem(is);
|
||||
boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY);
|
||||
return isEncrypted ? createEncryptedOOXMLExtractor(fs) : createExtractor(fs);
|
||||
case OOXML:
|
||||
return createExtractor(OPCPackage.open(is));
|
||||
default:
|
||||
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream, found type: " + fm);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to determine the actual type of file and produces a matching text-extractor for it.
|
||||
*
|
||||
* @param pkg An {@link OPCPackage}.
|
||||
* @return A {@link POIXMLTextExtractor} for the given file.
|
||||
* @throws IOException If an error occurs while reading the file
|
||||
* @throws OpenXML4JException If an error parsing the OpenXML file format is found.
|
||||
* @throws XmlException If an XML parsing error occurs.
|
||||
* @throws IllegalArgumentException If no matching file type could be found.
|
||||
*/
|
||||
public static POITextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
|
||||
try {
|
||||
// Check for the normal Office core document
|
||||
PackageRelationshipCollection core;
|
||||
core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
||||
|
||||
// If nothing was found, try some of the other OOXML-based core types
|
||||
if (core.size() == 0) {
|
||||
// Could it be an OOXML-Strict one?
|
||||
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
|
||||
}
|
||||
if (core.size() == 0) {
|
||||
// Could it be a visio one?
|
||||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
|
||||
if (core.size() == 1)
|
||||
return new XDGFVisioExtractor(pkg);
|
||||
}
|
||||
|
||||
// Should just be a single core document, complain if not
|
||||
if (core.size() != 1) {
|
||||
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
|
||||
}
|
||||
|
||||
// Grab the core document part, and try to identify from that
|
||||
final PackagePart corePart = pkg.getPart(core.getRelationship(0));
|
||||
final String contentType = corePart.getContentType();
|
||||
|
||||
// Is it XSSF?
|
||||
for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
|
||||
if ( rel.getContentType().equals( contentType ) ) {
|
||||
if (getPreferEventExtractor()) {
|
||||
return new XSSFEventBasedExcelExtractor(pkg);
|
||||
}
|
||||
return new XSSFExcelExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Is it XWPF?
|
||||
for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
|
||||
if ( rel.getContentType().equals( contentType ) ) {
|
||||
return new XWPFWordExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Is it XSLF?
|
||||
for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) {
|
||||
if ( rel.getContentType().equals( contentType ) ) {
|
||||
return new SlideShowExtractor<>(new XMLSlideShow(pkg));
|
||||
}
|
||||
}
|
||||
|
||||
// special handling for SlideShow-Theme-files,
|
||||
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
|
||||
return new SlideShowExtractor<>(new XMLSlideShow(pkg));
|
||||
}
|
||||
|
||||
// How about xlsb?
|
||||
for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
|
||||
if (rel.getContentType().equals(contentType)) {
|
||||
return new XSSFBEventBasedExcelExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")");
|
||||
|
||||
} catch (IOException | Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
|
||||
// ensure that we close the package again if there is an error opening it, however
|
||||
// we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor!
|
||||
pkg.revert();
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
public static <T extends POITextExtractor> T createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||
return createExtractor(fs.getRoot());
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <T extends POITextExtractor> T createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException
|
||||
{
|
||||
// First, check for OOXML
|
||||
for (String entryName : poifsDir.getEntryNames()) {
|
||||
if (entryName.equals("Package")) {
|
||||
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
|
||||
return (T)createExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// If not, ask the OLE2 code to check, with Scratchpad if possible
|
||||
return (T)OLE2ExtractorFactory.createExtractor(poifsDir);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of text extractors, one for each of
|
||||
* the embedded documents in the file (if there are any).
|
||||
* If there are no embedded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*/
|
||||
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
|
||||
// All the embedded directories we spotted
|
||||
ArrayList<Entry> dirs = new ArrayList<>();
|
||||
// For anything else not directly held in as a POIFS directory
|
||||
ArrayList<InputStream> nonPOIFS = new ArrayList<>();
|
||||
|
||||
// Find all the embedded directories
|
||||
DirectoryEntry root = ext.getRoot();
|
||||
if (root == null) {
|
||||
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
|
||||
}
|
||||
|
||||
// provide ExcelExtractor also in OOXML module, because scratchpad is not necessary for it
|
||||
if (ext instanceof ExcelExtractor) {
|
||||
// These are in MBD... under the root
|
||||
Iterator<Entry> it = root.getEntries();
|
||||
while (it.hasNext()) {
|
||||
Entry entry = it.next();
|
||||
if (entry.getName().startsWith("MBD")) {
|
||||
dirs.add(entry);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
Class<?> clazz = Class.forName("org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory");
|
||||
Method m = clazz.getDeclaredMethod("identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
|
||||
m.invoke(null, ext, dirs, nonPOIFS);
|
||||
} catch (ReflectiveOperationException e) {
|
||||
logger.log(POILogger.WARN, "POI Scratchpad jar not included ", e.getLocalizedMessage());
|
||||
return new POITextExtractor[0];
|
||||
}
|
||||
}
|
||||
|
||||
// Create the extractors
|
||||
if (dirs.size() == 0 && nonPOIFS.size() == 0){
|
||||
return new POITextExtractor[0];
|
||||
}
|
||||
|
||||
ArrayList<POITextExtractor> textExtractors = new ArrayList<>();
|
||||
for (Entry dir : dirs) {
|
||||
textExtractors.add(createExtractor((DirectoryNode) dir));
|
||||
}
|
||||
for (InputStream nonPOIF : nonPOIFS) {
|
||||
try {
|
||||
textExtractors.add(createExtractor(nonPOIF));
|
||||
} catch (IllegalArgumentException e) {
|
||||
// Ignore, just means it didn't contain
|
||||
// a format we support as yet
|
||||
logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
|
||||
} catch (XmlException | OpenXML4JException e) {
|
||||
throw new IOException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
return textExtractors.toArray(new POITextExtractor[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of text extractors, one for each of
|
||||
* the embedded documents in the file (if there are any).
|
||||
* If there are no embedded documents, you'll get back an
|
||||
* empty array. Otherwise, you'll get one open
|
||||
* {@link POITextExtractor} for each embedded file.
|
||||
*/
|
||||
@NotImplemented
|
||||
@SuppressWarnings({"UnusedParameters", "UnusedReturnValue"})
|
||||
public static POITextExtractor[] getEmbeddedDocsTextExtractors(POIXMLTextExtractor ext) {
|
||||
throw new IllegalStateException("Not yet supported");
|
||||
}
|
||||
|
||||
private static POITextExtractor createEncryptedOOXMLExtractor(POIFSFileSystem fs)
|
||||
throws IOException {
|
||||
String pass = Biff8EncryptionKey.getCurrentUserPassword();
|
||||
if (pass == null) {
|
||||
pass = Decryptor.DEFAULT_PASSWORD;
|
||||
}
|
||||
|
||||
EncryptionInfo ei = new EncryptionInfo(fs);
|
||||
Decryptor dec = ei.getDecryptor();
|
||||
InputStream is = null;
|
||||
try {
|
||||
if (!dec.verifyPassword(pass)) {
|
||||
throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor");
|
||||
}
|
||||
is = dec.getDataStream(fs);
|
||||
return createExtractor(OPCPackage.open(is));
|
||||
} catch (IOException e) {
|
||||
throw e;
|
||||
} catch (Exception e) {
|
||||
throw new EncryptedDocumentException(e);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(is);
|
||||
|
||||
// also close the POIFSFileSystem here as we read all the data
|
||||
// while decrypting
|
||||
fs.close();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,281 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.ooxml.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.ExtractorProvider;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||
import org.apache.poi.openxml4j.opc.PackagePart;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
|
||||
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
|
||||
import org.apache.poi.poifs.crypt.Decryptor;
|
||||
import org.apache.poi.poifs.crypt.EncryptionInfo;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.FileMagic;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
|
||||
import org.apache.poi.xslf.extractor.XSLFExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFRelation;
|
||||
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
||||
import org.apache.poi.xssf.usermodel.XSSFRelation;
|
||||
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFRelation;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
|
||||
/**
|
||||
* Figures out the correct POITextExtractor for your supplied
|
||||
* document, and returns it.
|
||||
*
|
||||
* <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is
|
||||
* not present on the runtime classpath</p>
|
||||
* <p>Note 2 - rather than using this, for most cases you would be better
|
||||
* off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p>
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public final class POIXMLExtractorFactory implements ExtractorProvider {
|
||||
private static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT;
|
||||
private static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
|
||||
private static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
|
||||
|
||||
private static final XSLFRelation[] SUPPORTED_XSLF_TYPES = new XSLFRelation[]{
|
||||
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
|
||||
XSLFRelation.PRESENTATIONML, XSLFRelation.PRESENTATIONML_TEMPLATE,
|
||||
XSLFRelation.PRESENTATION_MACRO
|
||||
};
|
||||
|
||||
@Override
|
||||
public boolean accepts(FileMagic fm) {
|
||||
return fm == FileMagic.OOXML;
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is false.
|
||||
*/
|
||||
public static boolean getThreadPrefersEventExtractors() {
|
||||
return ExtractorFactory.getThreadPrefersEventExtractors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||
* Default is to use the thread level setting, which defaults to false.
|
||||
*/
|
||||
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||
return ExtractorFactory.getAllThreadsPreferEventExtractors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread prefer event based over usermodel based extractors?
|
||||
* Will only be used if the All Threads setting is null.
|
||||
*/
|
||||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||
ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should all threads prefer event based over usermodel based extractors?
|
||||
* If set, will take preference over the Thread level setting.
|
||||
*/
|
||||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||
ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should this thread use event based extractors is available?
|
||||
* Checks the all-threads one first, then thread specific.
|
||||
*/
|
||||
public static boolean getPreferEventExtractor() {
|
||||
return ExtractorFactory.getPreferEventExtractor();
|
||||
}
|
||||
|
||||
@Override
|
||||
public POITextExtractor create(File f, String password) throws IOException {
|
||||
if (FileMagic.valueOf(f) != FileMagic.OOXML) {
|
||||
return ExtractorFactory.createExtractor(f, password);
|
||||
}
|
||||
|
||||
|
||||
OPCPackage pkg = null;
|
||||
try {
|
||||
pkg = OPCPackage.open(f.toString(), PackageAccess.READ);
|
||||
POIXMLTextExtractor ex = create(pkg);
|
||||
if (ex == null) {
|
||||
pkg.revert();
|
||||
}
|
||||
return ex;
|
||||
} catch (InvalidFormatException ife) {
|
||||
throw new IOException(ife);
|
||||
} catch (IOException e) {
|
||||
pkg.revert();
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
public POITextExtractor create(InputStream inp, String password) throws IOException {
|
||||
InputStream is = FileMagic.prepareToCheckMagic(inp);
|
||||
|
||||
if (FileMagic.valueOf(is) != FileMagic.OOXML) {
|
||||
return ExtractorFactory.createExtractor(is, password);
|
||||
}
|
||||
|
||||
OPCPackage pkg = null;
|
||||
try {
|
||||
pkg = OPCPackage.open(is);
|
||||
POIXMLTextExtractor ex = create(pkg);
|
||||
if (ex == null) {
|
||||
pkg.revert();
|
||||
}
|
||||
return ex;
|
||||
} catch (InvalidFormatException e) {
|
||||
throw new IOException(e);
|
||||
} catch (RuntimeException | IOException e) {
|
||||
if (pkg != null) {
|
||||
pkg.revert();
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to determine the actual type of file and produces a matching text-extractor for it.
|
||||
*
|
||||
* @param pkg An {@link OPCPackage}.
|
||||
* @return A {@link POIXMLTextExtractor} for the given file.
|
||||
* @throws IOException If an error occurs while reading the file
|
||||
* @throws IllegalArgumentException If no matching file type could be found.
|
||||
*/
|
||||
public POIXMLTextExtractor create(OPCPackage pkg) throws IOException {
|
||||
try {
|
||||
// Check for the normal Office core document
|
||||
PackageRelationshipCollection core;
|
||||
core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
|
||||
|
||||
// If nothing was found, try some of the other OOXML-based core types
|
||||
if (core.size() == 0) {
|
||||
// Could it be an OOXML-Strict one?
|
||||
core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL);
|
||||
}
|
||||
if (core.size() == 0) {
|
||||
// Could it be a visio one?
|
||||
core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL);
|
||||
if (core.size() == 1) {
|
||||
return new XDGFVisioExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Should just be a single core document, complain if not
|
||||
if (core.size() != 1) {
|
||||
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
|
||||
}
|
||||
|
||||
// Grab the core document part, and try to identify from that
|
||||
final PackagePart corePart = pkg.getPart(core.getRelationship(0));
|
||||
final String contentType = corePart.getContentType();
|
||||
|
||||
// Is it XSSF?
|
||||
for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
|
||||
if (rel.getContentType().equals(contentType)) {
|
||||
if (getPreferEventExtractor()) {
|
||||
return new XSSFEventBasedExcelExtractor(pkg);
|
||||
}
|
||||
return new XSSFExcelExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Is it XWPF?
|
||||
for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
|
||||
if (rel.getContentType().equals(contentType)) {
|
||||
return new XWPFWordExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
// Is it XSLF?
|
||||
for (XSLFRelation rel : SUPPORTED_XSLF_TYPES) {
|
||||
if (rel.getContentType().equals(contentType)) {
|
||||
return new XSLFExtractor(new XMLSlideShow(pkg));
|
||||
}
|
||||
}
|
||||
|
||||
// special handling for SlideShow-Theme-files,
|
||||
if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) {
|
||||
return new XSLFExtractor(new XMLSlideShow(pkg));
|
||||
}
|
||||
|
||||
// How about xlsb?
|
||||
for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) {
|
||||
if (rel.getContentType().equals(contentType)) {
|
||||
return new XSSFBEventBasedExcelExtractor(pkg);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (IOException e) {
|
||||
throw e;
|
||||
} catch (Error | RuntimeException | XmlException | OpenXML4JException e) { // NOSONAR
|
||||
throw new IOException(e);
|
||||
}
|
||||
// we used to close (revert()) the package here, but this is the callers responsibility
|
||||
// and we can't reuse the package
|
||||
}
|
||||
|
||||
public POITextExtractor create(POIFSFileSystem fs) throws IOException {
|
||||
return create(fs.getRoot(), Biff8EncryptionKey.getCurrentUserPassword());
|
||||
}
|
||||
|
||||
@Override
|
||||
public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
|
||||
// First, check for plain OOXML package
|
||||
if (poifsDir.hasEntry("Package")) {
|
||||
try (InputStream is = poifsDir.createDocumentInputStream("Package")) {
|
||||
return create(is, password);
|
||||
}
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) {
|
||||
EncryptionInfo ei = new EncryptionInfo(poifsDir);
|
||||
Decryptor dec = ei.getDecryptor();
|
||||
try {
|
||||
if (!dec.verifyPassword(password)) {
|
||||
throw new IOException("Invalid password specified");
|
||||
}
|
||||
try (InputStream is = dec.getDataStream(poifsDir)) {
|
||||
return create(is, password);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw e;
|
||||
} catch (Exception e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
throw new IOException("The OLE2 file neither contained a plain OOXML package node (\"Package\") nor an encrypted one (\"EncryptedPackage\").");
|
||||
}
|
||||
}
|
|
@ -36,9 +36,10 @@ import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProper
|
|||
* content of the OOXML file properties, eg author
|
||||
* and title.
|
||||
*/
|
||||
public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor {
|
||||
|
||||
public class POIXMLPropertiesTextExtractor implements POIXMLTextExtractor {
|
||||
private final POIXMLDocument doc;
|
||||
private final DateFormat dateFormat;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
/**
|
||||
* Creates a new POIXMLPropertiesTextExtractor for the given open document.
|
||||
|
@ -46,7 +47,7 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor {
|
|||
* @param doc the given open document
|
||||
*/
|
||||
public POIXMLPropertiesTextExtractor(POIXMLDocument doc) {
|
||||
super(doc);
|
||||
this.doc = doc;
|
||||
DateFormatSymbols dfs = DateFormatSymbols.getInstance(Locale.ROOT);
|
||||
dateFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", dfs);
|
||||
dateFormat.setTimeZone(LocaleUtil.TIMEZONE_UTC);
|
||||
|
@ -281,4 +282,24 @@ public class POIXMLPropertiesTextExtractor extends POIXMLTextExtractor {
|
|||
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
|
||||
throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
|
||||
}
|
||||
|
||||
@Override
|
||||
public POIXMLDocument getDocument() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public POIXMLDocument getFilesystem() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,42 +27,30 @@ import org.apache.poi.ooxml.POIXMLProperties.ExtendedProperties;
|
|||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.util.ZipSecureFile;
|
||||
|
||||
public abstract class POIXMLTextExtractor extends POITextExtractor {
|
||||
/** The POIXMLDocument that's open */
|
||||
private final POIXMLDocument _document;
|
||||
|
||||
/**
|
||||
* Creates a new text extractor for the given document
|
||||
*
|
||||
* @param document the document to extract from
|
||||
*/
|
||||
public POIXMLTextExtractor(POIXMLDocument document) {
|
||||
_document = document;
|
||||
}
|
||||
|
||||
public interface POIXMLTextExtractor extends POITextExtractor {
|
||||
/**
|
||||
* Returns the core document properties
|
||||
*
|
||||
* @return the core document properties
|
||||
*/
|
||||
public CoreProperties getCoreProperties() {
|
||||
return _document.getProperties().getCoreProperties();
|
||||
default CoreProperties getCoreProperties() {
|
||||
return getDocument().getProperties().getCoreProperties();
|
||||
}
|
||||
/**
|
||||
* Returns the extended document properties
|
||||
*
|
||||
* @return the extended document properties
|
||||
*/
|
||||
public ExtendedProperties getExtendedProperties() {
|
||||
return _document.getProperties().getExtendedProperties();
|
||||
default ExtendedProperties getExtendedProperties() {
|
||||
return getDocument().getProperties().getExtendedProperties();
|
||||
}
|
||||
/**
|
||||
* Returns the custom document properties
|
||||
*
|
||||
* @return the custom document properties
|
||||
*/
|
||||
public CustomProperties getCustomProperties() {
|
||||
return _document.getProperties().getCustomProperties();
|
||||
default CustomProperties getCustomProperties() {
|
||||
return getDocument().getProperties().getCustomProperties();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -71,17 +59,16 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
|
|||
* @return the opened document
|
||||
*/
|
||||
@Override
|
||||
public final POIXMLDocument getDocument() {
|
||||
return _document;
|
||||
}
|
||||
POIXMLDocument getDocument();
|
||||
|
||||
/**
|
||||
* Returns the opened OPCPackage that contains the document
|
||||
*
|
||||
* @return the opened OPCPackage
|
||||
*/
|
||||
public OPCPackage getPackage() {
|
||||
return _document.getPackage();
|
||||
default OPCPackage getPackage() {
|
||||
POIXMLDocument doc = getDocument();
|
||||
return doc != null ? doc.getPackage() : null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -89,25 +76,24 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
|
|||
* document properties metadata, such as title and author.
|
||||
*/
|
||||
@Override
|
||||
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
|
||||
return new POIXMLPropertiesTextExtractor(_document);
|
||||
default POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
|
||||
return new POIXMLPropertiesTextExtractor(getDocument());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
default void close() throws IOException {
|
||||
// e.g. XSSFEventBaseExcelExtractor passes a null-document
|
||||
if(_document != null) {
|
||||
if (isCloseFilesystem()) {
|
||||
@SuppressWarnings("resource")
|
||||
OPCPackage pkg = _document.getPackage();
|
||||
if(pkg != null) {
|
||||
OPCPackage pkg = getPackage();
|
||||
if (pkg != null) {
|
||||
// revert the package to not re-write the file, which is very likely not wanted for a TextExtractor!
|
||||
pkg.revert();
|
||||
}
|
||||
}
|
||||
super.close();
|
||||
}
|
||||
|
||||
protected void checkMaxTextSize(CharSequence text, String string) {
|
||||
default void checkMaxTextSize(CharSequence text, String string) {
|
||||
if(string == null) {
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.poi.xdgf.extractor;
|
|||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.ooxml.POIXMLDocument;
|
||||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.xdgf.usermodel.XDGFPage;
|
||||
|
@ -28,12 +27,12 @@ import org.apache.poi.xdgf.usermodel.shape.ShapeTextVisitor;
|
|||
/**
|
||||
* Helper class to extract text from an OOXML Visio File
|
||||
*/
|
||||
public class XDGFVisioExtractor extends POIXMLTextExtractor {
|
||||
public class XDGFVisioExtractor implements POIXMLTextExtractor {
|
||||
|
||||
protected final XmlVisioDocument document;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
public XDGFVisioExtractor(XmlVisioDocument document) {
|
||||
super(document);
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
|
@ -51,17 +50,23 @@ public class XDGFVisioExtractor extends POIXMLTextExtractor {
|
|||
return visitor.getText();
|
||||
}
|
||||
|
||||
public static void main(String [] args) throws IOException {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" XDGFVisioExtractor <filename.vsdx>");
|
||||
System.exit(1);
|
||||
}
|
||||
POIXMLTextExtractor extractor =
|
||||
new XDGFVisioExtractor(POIXMLDocument.openPackage(
|
||||
args[0]
|
||||
));
|
||||
System.out.println(extractor.getText());
|
||||
extractor.close();
|
||||
@Override
|
||||
public XmlVisioDocument getDocument() {
|
||||
return document;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public XmlVisioDocument getFilesystem() {
|
||||
return document;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.xslf.extractor;
|
||||
|
||||
import org.apache.poi.ooxml.extractor.POIXMLPropertiesTextExtractor;
|
||||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFShape;
|
||||
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
|
||||
|
||||
|
||||
/**
 * Text extractor for OOXML PowerPoint files.
 * <p>
 * Extends the generic {@link SlideShowExtractor} (parameterised with the XSLF shape and
 * paragraph types) and additionally implements {@link POIXMLTextExtractor}.
 */
|
||||
public class XSLFExtractor extends SlideShowExtractor<XSLFShape, XSLFTextParagraph> implements POIXMLTextExtractor {
|
||||
/**
 * Creates an extractor over the given slideshow.
 *
 * @param slideshow the slideshow, passed straight to the superclass constructor
 */
public XSLFExtractor(XMLSlideShow slideshow) {
|
||||
super(slideshow);
|
||||
}
|
||||
|
||||
/**
 * @return the inherited {@code slideshow} reference, cast to {@link XMLSlideShow}
 */
@Override
|
||||
public XMLSlideShow getDocument() {
// The only constructor of this class accepts an XMLSlideShow; presumably that is
// what makes this downcast safe (assumes the superclass never swaps the reference - confirm).
return (XMLSlideShow)slideshow;
|
||||
}
|
||||
|
||||
/**
 * Returns the metadata extractor produced by the default implementation in
 * {@link POIXMLTextExtractor}.
 * NOTE(review): the explicit override presumably exists to pick the interface default
 * over a same-named method inherited from the superclass - confirm.
 *
 * @return the result of {@code POIXMLTextExtractor.super.getMetadataTextExtractor()}
 */
@Override
|
||||
public POIXMLPropertiesTextExtractor getMetadataTextExtractor() {
|
||||
return POIXMLTextExtractor.super.getMetadataTextExtractor();
|
||||
}
|
||||
}
|
|
@ -19,7 +19,6 @@ package org.apache.poi.xssf.extractor;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.ss.usermodel.DataFormatter;
|
||||
|
@ -43,8 +42,7 @@ import org.xml.sax.SAXException;
|
|||
*
|
||||
* @since 3.16-beta3
|
||||
*/
|
||||
public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor
|
||||
implements org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor {
|
||||
|
||||
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFBEventBasedExcelExtractor.class);
|
||||
|
||||
|
@ -62,18 +60,6 @@ public class XSSFBEventBasedExcelExtractor extends XSSFEventBasedExcelExtractor
|
|||
super(container);
|
||||
}
|
||||
|
||||
/**
 * Command-line entry point: prints the extracted text of the file named by the
 * first argument to standard output.
 * With no arguments a usage message is written to standard error and the JVM
 * exits with status 1.
 *
 * @param args command-line arguments; {@code args[0]} is the file to extract from
 * @throws Exception anything raised while opening or extracting is propagated unchanged
 */
public static void main(String[] args) throws Exception {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" XSSFBEventBasedExcelExtractor <filename.xlsb>");
|
||||
System.exit(1);
|
||||
}
|
||||
POIXMLTextExtractor extractor =
|
||||
new XSSFBEventBasedExcelExtractor(args[0]);
|
||||
System.out.println(extractor.getText());
|
||||
// NOTE(review): close() is skipped if getText() throws; try-with-resources would be safer.
extractor.close();
|
||||
}
|
||||
|
||||
/**
 * Stores the hyperlink-handling flag.
 * NOTE(review): presumably controls whether hyperlinks found in cells are included in
 * the extracted text - the code reading the flag is not visible here, confirm.
 *
 * @param handleHyperlinksInCells value to store in the {@code handleHyperlinksInCells} field
 */
public void setHandleHyperlinksInCells(boolean handleHyperlinksInCells) {
|
||||
this.handleHyperlinksInCells = handleHyperlinksInCells;
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.util.Map;
|
|||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import org.apache.poi.ooxml.POIXMLDocument;
|
||||
import org.apache.poi.ooxml.POIXMLProperties;
|
||||
import org.apache.poi.ooxml.POIXMLProperties.CoreProperties;
|
||||
import org.apache.poi.ooxml.POIXMLProperties.CustomProperties;
|
||||
|
@ -57,13 +58,13 @@ import org.xml.sax.XMLReader;
|
|||
* Implementation of a text extractor from OOXML Excel
|
||||
* files that uses SAX event based parsing.
|
||||
*/
|
||||
public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
||||
implements org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
public class XSSFEventBasedExcelExtractor
|
||||
implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
|
||||
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFEventBasedExcelExtractor.class);
|
||||
|
||||
protected OPCPackage container;
|
||||
protected POIXMLProperties properties;
|
||||
protected final OPCPackage container;
|
||||
protected final POIXMLProperties properties;
|
||||
|
||||
protected Locale locale;
|
||||
protected boolean includeTextBoxes = true;
|
||||
|
@ -73,29 +74,17 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
|||
protected boolean formulasNotResults;
|
||||
protected boolean concatenatePhoneticRuns = true;
|
||||
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
/**
 * Convenience constructor: opens the package at the given path with
 * {@code OPCPackage.open(path)} and delegates to the {@code OPCPackage} constructor.
 *
 * @param path location of the file to open
 * @throws XmlException declared by the delegated constructor
 * @throws OpenXML4JException declared by the delegated constructor / package opening
 * @throws IOException declared by the delegated constructor
 */
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
|
||||
this(OPCPackage.open(path));
|
||||
}
|
||||
|
||||
public XSSFEventBasedExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
|
||||
super(null);
|
||||
this.container = container;
|
||||
|
||||
properties = new POIXMLProperties(container);
|
||||
}
|
||||
|
||||
/**
 * Command-line entry point: prints the extracted text of the file named by the
 * first argument to standard output.
 * With no arguments a usage message is written to standard error and the JVM
 * exits with status 1.
 *
 * @param args command-line arguments; {@code args[0]} is the file to extract from
 * @throws Exception anything raised while opening or extracting is propagated unchanged
 */
public static void main(String[] args) throws Exception {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" XSSFEventBasedExcelExtractor <filename.xlsx>");
|
||||
System.exit(1);
|
||||
}
|
||||
POIXMLTextExtractor extractor =
|
||||
new XSSFEventBasedExcelExtractor(args[0]);
|
||||
System.out.println(extractor.getText());
|
||||
// NOTE(review): close() is skipped if getText() throws; try-with-resources would be safer.
extractor.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should sheet names be included? Default is true
|
||||
*/
|
||||
|
@ -319,12 +308,23 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
|
|||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (container != null) {
|
||||
container.close();
|
||||
container = null;
|
||||
}
|
||||
super.close();
|
||||
public POIXMLDocument getDocument() {
|
||||
// Always null: this extractor keeps an OPCPackage ('container') rather than a
// POIXMLDocument, so callers must be prepared for a null result.
return null;
|
||||
}
|
||||
|
||||
/**
 * Stores the close-filesystem flag.
 * NOTE(review): presumably this decides whether the underlying package is closed
 * together with the extractor - the code that reads the flag is not visible here, confirm.
 *
 * @param doCloseFilesystem value to store in the {@code doCloseFilesystem} field
 */
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * @return the current value of the {@code doCloseFilesystem} field,
 *         i.e. whatever was last passed to {@link #setCloseFilesystem(boolean)}
 */
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * @return the {@code container} field, i.e. the {@link OPCPackage} this extractor reads from
 */
@Override
|
||||
public OPCPackage getFilesystem() {
|
||||
return container;
|
||||
}
|
||||
|
||||
protected class SheetTextExtractor implements SheetContentsHandler {
|
||||
|
|
|
@ -20,8 +20,8 @@ import java.io.IOException;
|
|||
import java.util.Iterator;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.ss.usermodel.Cell;
|
||||
|
@ -44,8 +44,8 @@ import org.apache.xmlbeans.XmlException;
|
|||
/**
|
||||
* Helper class to extract text from an OOXML Excel file
|
||||
*/
|
||||
public class XSSFExcelExtractor extends POIXMLTextExtractor
|
||||
implements org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
public class XSSFExcelExtractor
|
||||
implements POIXMLTextExtractor, org.apache.poi.ss.extractor.ExcelExtractor {
|
||||
public static final XSSFRelation[] SUPPORTED_TYPES = new XSSFRelation[] {
|
||||
XSSFRelation.WORKBOOK, XSSFRelation.MACRO_TEMPLATE_WORKBOOK,
|
||||
XSSFRelation.MACRO_ADDIN_WORKBOOK, XSSFRelation.TEMPLATE_WORKBOOK,
|
||||
|
@ -53,34 +53,21 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
|
|||
};
|
||||
|
||||
private Locale locale;
|
||||
private XSSFWorkbook workbook;
|
||||
private final XSSFWorkbook workbook;
|
||||
private boolean includeSheetNames = true;
|
||||
private boolean formulasNotResults;
|
||||
private boolean includeCellComments;
|
||||
private boolean includeHeadersFooters = true;
|
||||
private boolean includeTextBoxes = true;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
/**
 * Convenience constructor: wraps the package in a new {@code XSSFWorkbook} and
 * delegates to the workbook constructor.
 *
 * @param container the package to build the workbook from
 * @throws XmlException declared for callers; see the workbook constructor
 * @throws OpenXML4JException declared for callers; see the workbook constructor
 * @throws IOException declared for callers; see the workbook constructor
 */
public XSSFExcelExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
|
||||
this(new XSSFWorkbook(container));
|
||||
}
|
||||
public XSSFExcelExtractor(XSSFWorkbook workbook) {
|
||||
super(workbook);
|
||||
this.workbook = workbook;
|
||||
}
|
||||
|
||||
/**
 * Command-line entry point: prints the extracted text for the file named by the
 * first argument to standard output.
 * With no arguments a usage message is written to standard error and the JVM
 * exits with status 1. Package and extractor are both released by the
 * try-with-resources statement.
 *
 * @param args command-line arguments; {@code args[0]} is the file to extract from
 * @throws Exception anything raised while opening or extracting is propagated unchanged
 */
public static void main(String[] args) throws Exception {
|
||||
if(args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" XSSFExcelExtractor <filename.xlsx>");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
// NOTE(review): this calls OPCPackage.create(...), whereas the sibling extractors open
// existing files via OPCPackage.open(...) / POIXMLDocument.openPackage(...). Presumably
// open(...) was intended for reading an existing .xlsx - confirm against the OPCPackage API.
try (OPCPackage pkg = OPCPackage.create(args[0]);
|
||||
POIXMLTextExtractor extractor = new XSSFExcelExtractor(pkg)) {
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Should sheet names be included? Default is true
|
||||
*/
|
||||
|
@ -262,4 +249,24 @@ public class XSSFExcelExtractor extends POIXMLTextExtractor
|
|||
/**
 * Renders a header or footer as text by delegating to the shared static helper
 * {@code ExcelExtractor._extractHeaderFooter}.
 *
 * @param hf the header or footer to render
 * @return whatever the shared helper returns for {@code hf}
 */
private String extractHeaderFooter(HeaderFooter hf) {
|
||||
return ExcelExtractor._extractHeaderFooter(hf);
|
||||
}
|
||||
|
||||
/**
 * @return the {@code workbook} field - the workbook this extractor reads from
 */
@Override
|
||||
public XSSFWorkbook getDocument() {
|
||||
return workbook;
|
||||
}
|
||||
|
||||
/**
 * Stores the close-filesystem flag.
 * NOTE(review): presumably this decides whether the workbook is closed together with
 * the extractor - the code that reads the flag is not visible here, confirm.
 *
 * @param doCloseFilesystem value to store in the {@code doCloseFilesystem} field
 */
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * @return the current value of the {@code doCloseFilesystem} field,
 *         i.e. whatever was last passed to {@link #setCloseFilesystem(boolean)}
 */
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * @return the {@code workbook} field - the same object {@link #getDocument()} returns,
 *         so the workbook itself serves as this extractor's "filesystem"
 */
@Override
|
||||
public XSSFWorkbook getFilesystem() {
|
||||
return workbook;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,9 +19,7 @@ package org.apache.poi.xwpf.extractor;
|
|||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.ooxml.POIXMLDocument;
|
||||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
|
||||
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
|
||||
|
@ -39,46 +37,31 @@ import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
|
|||
import org.apache.poi.xwpf.usermodel.XWPFTable;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
|
||||
|
||||
/**
|
||||
* Helper class to extract text from an OOXML Word file
|
||||
*/
|
||||
public class XWPFWordExtractor extends POIXMLTextExtractor {
|
||||
public class XWPFWordExtractor implements POIXMLTextExtractor {
|
||||
public static final XWPFRelation[] SUPPORTED_TYPES = {
|
||||
XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE,
|
||||
XWPFRelation.MACRO_DOCUMENT,
|
||||
XWPFRelation.MACRO_TEMPLATE_DOCUMENT
|
||||
};
|
||||
|
||||
private XWPFDocument document;
|
||||
private final XWPFDocument document;
|
||||
private boolean fetchHyperlinks;
|
||||
private boolean concatenatePhoneticRuns = true;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
public XWPFWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
|
||||
public XWPFWordExtractor(OPCPackage container) throws IOException {
|
||||
this(new XWPFDocument(container));
|
||||
}
|
||||
|
||||
public XWPFWordExtractor(XWPFDocument document) {
|
||||
super(document);
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
/**
 * Command-line entry point: opens the file named by the first argument via
 * {@code POIXMLDocument.openPackage} and prints its extracted text to standard output.
 * With no arguments a usage message is written to standard error and the JVM
 * exits with status 1.
 *
 * @param args command-line arguments; {@code args[0]} is the file to extract from
 * @throws Exception anything raised while opening or extracting is propagated unchanged
 */
public static void main(String[] args) throws Exception {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" XWPFWordExtractor <filename.docx>");
|
||||
System.exit(1);
|
||||
}
|
||||
POIXMLTextExtractor extractor =
|
||||
new XWPFWordExtractor(POIXMLDocument.openPackage(
|
||||
args[0]
|
||||
));
|
||||
System.out.println(extractor.getText());
|
||||
// NOTE(review): close() is skipped if getText() throws; try-with-resources would be safer.
extractor.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we also fetch the hyperlinks, when fetching
|
||||
* the text content? Default is to only output the
|
||||
|
@ -217,4 +200,24 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
|
|||
text.append(hfPolicy.getDefaultHeader().getText());
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * @return the {@code document} field - the Word document this extractor reads from
 */
@Override
|
||||
public XWPFDocument getDocument() {
|
||||
return document;
|
||||
}
|
||||
|
||||
/**
 * Stores the close-filesystem flag.
 * NOTE(review): presumably this decides whether the document is closed together with
 * the extractor - the code that reads the flag is not visible here, confirm.
 *
 * @param doCloseFilesystem value to store in the {@code doCloseFilesystem} field
 */
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * @return the current value of the {@code doCloseFilesystem} field,
 *         i.e. whatever was last passed to {@link #setCloseFilesystem(boolean)}
 */
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * @return the {@code document} field - the same object {@link #getDocument()} returns,
 *         so the Word document itself serves as this extractor's "filesystem"
 */
@Override
|
||||
public XWPFDocument getFilesystem() {
|
||||
return document;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,23 +31,25 @@ import java.util.Locale;
|
|||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.UnsupportedFileFormatException;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.POIOLE2TextExtractor;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
import org.apache.poi.hssf.OldExcelFormatException;
|
||||
import org.apache.poi.hssf.extractor.EventBasedExcelExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.ooxml.extractor.ExtractorFactory;
|
||||
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
|
||||
import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||
import org.apache.poi.openxml4j.opc.PackageAccess;
|
||||
import org.apache.poi.poifs.filesystem.FileMagic;
|
||||
import org.apache.poi.poifs.filesystem.NotOLE2FileException;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
|
||||
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
/**
|
||||
* Test that the extractor factory plays nicely
|
||||
|
@ -89,6 +91,8 @@ public class TestExtractorFactory {
|
|||
private static POIDataSamples pubTests = POIDataSamples.getPublisherInstance();
|
||||
private static File pub = getFileAndCheck(pubTests, "Simple.pub");
|
||||
|
||||
private static final POIXMLExtractorFactory xmlFactory = new POIXMLExtractorFactory();
|
||||
|
||||
private static File getFileAndCheck(POIDataSamples samples, String name) {
|
||||
File file = samples.getFile(name);
|
||||
|
||||
|
@ -110,7 +114,7 @@ public class TestExtractorFactory {
|
|||
"Word 6", doc6, "Word6Extractor", 20,
|
||||
"Word 95", doc95, "Word6Extractor", 120,
|
||||
"PowerPoint", ppt, "SlideShowExtractor", 120,
|
||||
"PowerPoint - pptx", pptx, "SlideShowExtractor", 120,
|
||||
"PowerPoint - pptx", pptx, "XSLFExtractor", 120,
|
||||
"Visio", vsd, "VisioTextExtractor", 50,
|
||||
"Visio - vsdx", vsdx, "XDGFVisioExtractor", 20,
|
||||
"Publisher", pub, "PublisherTextExtractor", 50,
|
||||
|
@ -125,6 +129,8 @@ public class TestExtractorFactory {
|
|||
R apply(T t) throws IOException, OpenXML4JException, XmlException;
|
||||
}
|
||||
|
||||
@Rule
|
||||
public ExpectedException thrown = ExpectedException.none();
|
||||
|
||||
@Test
|
||||
public void testFile() throws Exception {
|
||||
|
@ -135,12 +141,12 @@ public class TestExtractorFactory {
|
|||
}
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
@Test
|
||||
public void testFileInvalid() throws Exception {
|
||||
thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN");
|
||||
thrown.expect(IOException.class);
|
||||
// Text
|
||||
try (POITextExtractor ignored = ExtractorFactory.createExtractor(txt)) {
|
||||
fail("extracting from invalid package");
|
||||
}
|
||||
ExtractorFactory.createExtractor(txt);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -148,8 +154,10 @@ public class TestExtractorFactory {
|
|||
testStream(ExtractorFactory::createExtractor, true);
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
@Test
|
||||
public void testInputStreamInvalid() throws Exception {
|
||||
thrown.expectMessage("Can't create extractor - unsupported file type: UNKNOWN");
|
||||
thrown.expect(IOException.class);
|
||||
testInvalid(ExtractorFactory::createExtractor);
|
||||
}
|
||||
|
||||
|
@ -158,8 +166,10 @@ public class TestExtractorFactory {
|
|||
testStream((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)), false);
|
||||
}
|
||||
|
||||
@Test(expected = IOException.class)
|
||||
@Test
|
||||
public void testPOIFSInvalid() throws Exception {
|
||||
thrown.expectMessage("Invalid header signature; read 0x3D20726F68747541, expected 0xE11AB1A1E011CFD0");
|
||||
thrown.expect(NotOLE2FileException.class);
|
||||
testInvalid((f) -> ExtractorFactory.createExtractor(new POIFSFileSystem(f)));
|
||||
}
|
||||
|
||||
|
@ -195,9 +205,7 @@ public class TestExtractorFactory {
|
|||
POITextExtractor ignored = poifs.apply(fis)) {
|
||||
fail("extracting from invalid package");
|
||||
} catch (IllegalArgumentException e) {
|
||||
assertTrue("Had: " + e,
|
||||
e.getMessage().contains(FileMagic.UNKNOWN.name()));
|
||||
|
||||
assertTrue("Had: " + e, e.getMessage().contains(FileMagic.UNKNOWN.name()));
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
@ -211,7 +219,7 @@ public class TestExtractorFactory {
|
|||
}
|
||||
|
||||
try (final OPCPackage pkg = OPCPackage.open(testFile, PackageAccess.READ);
|
||||
final POITextExtractor ext = ExtractorFactory.createExtractor(pkg)) {
|
||||
final POITextExtractor ext = xmlFactory.create(pkg)) {
|
||||
testExtractor(ext, (String) TEST_SET[i], (String) TEST_SET[i + 2], (Integer) TEST_SET[i + 3]);
|
||||
pkg.revert();
|
||||
}
|
||||
|
@ -222,7 +230,7 @@ public class TestExtractorFactory {
|
|||
public void testPackageInvalid() throws Exception {
|
||||
// Text
|
||||
try (final OPCPackage pkg = OPCPackage.open(txt, PackageAccess.READ);
|
||||
final POITextExtractor ignored = ExtractorFactory.createExtractor(pkg)) {
|
||||
final POITextExtractor ignored = xmlFactory.create(pkg)) {
|
||||
fail("extracting from invalid package");
|
||||
}
|
||||
}
|
||||
|
@ -251,61 +259,45 @@ public class TestExtractorFactory {
|
|||
assertTrue(ExtractorFactory.getThreadPrefersEventExtractors());
|
||||
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||
|
||||
try {
|
||||
// Check we get the right extractors now
|
||||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
|
||||
assertTrue(extractor instanceof EventBasedExcelExtractor);
|
||||
}
|
||||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
|
||||
assertTrue(extractor.getText().length() > 200);
|
||||
}
|
||||
|
||||
// Check we get the right extractors now
|
||||
POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof EventBasedExcelExtractor
|
||||
);
|
||||
extractor.close();
|
||||
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
|
||||
assertTrue(
|
||||
extractor.getText().length() > 200
|
||||
);
|
||||
extractor.close();
|
||||
try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
|
||||
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
|
||||
}
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
||||
assertTrue(extractor instanceof XSSFEventBasedExcelExtractor);
|
||||
extractor.close();
|
||||
try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
|
||||
assertTrue(extractor.getText().length() > 200);
|
||||
}
|
||||
} finally {
|
||||
// Put back to normal
|
||||
ExtractorFactory.setThreadPrefersEventExtractors(false);
|
||||
}
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
||||
assertTrue(
|
||||
extractor.getText().length() > 200
|
||||
);
|
||||
extractor.close();
|
||||
|
||||
|
||||
// Put back to normal
|
||||
ExtractorFactory.setThreadPrefersEventExtractors(false);
|
||||
assertFalse(ExtractorFactory.getPreferEventExtractor());
|
||||
assertFalse(ExtractorFactory.getThreadPrefersEventExtractors());
|
||||
assertNull(ExtractorFactory.getAllThreadsPreferEventExtractors());
|
||||
|
||||
// And back
|
||||
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof ExcelExtractor
|
||||
);
|
||||
extractor.close();
|
||||
extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)));
|
||||
assertTrue(
|
||||
extractor.getText().length() > 200
|
||||
);
|
||||
extractor.close();
|
||||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
|
||||
assertTrue(extractor instanceof ExcelExtractor);
|
||||
}
|
||||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(new POIFSFileSystem(new FileInputStream(xls)))) {
|
||||
assertTrue(extractor.getText().length() > 200);
|
||||
}
|
||||
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString(), PackageAccess.READ));
|
||||
assertTrue(
|
||||
extractor
|
||||
instanceof XSSFExcelExtractor
|
||||
);
|
||||
extractor.close();
|
||||
extractor = ExtractorFactory.createExtractor(OPCPackage.open(xlsx.toString()));
|
||||
assertTrue(
|
||||
extractor.getText().length() > 200
|
||||
);
|
||||
extractor.close();
|
||||
try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString(), PackageAccess.READ))) {
|
||||
assertTrue(extractor instanceof XSSFExcelExtractor);
|
||||
}
|
||||
try (POITextExtractor extractor = xmlFactory.create(OPCPackage.open(xlsx.toString()))) {
|
||||
assertTrue(extractor.getText().length() > 200);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -325,7 +317,7 @@ public class TestExtractorFactory {
|
|||
};
|
||||
|
||||
for (int i=0; i<testObj.length; i+=3) {
|
||||
try (final POIOLE2TextExtractor ext = ExtractorFactory.createExtractor((File)testObj[i+1])) {
|
||||
try (final POIOLE2TextExtractor ext = (POIOLE2TextExtractor)ExtractorFactory.createExtractor((File)testObj[i+1])) {
|
||||
final POITextExtractor[] embeds = ExtractorFactory.getEmbeddedDocsTextExtractors(ext);
|
||||
|
||||
int numWord = 0, numXls = 0, numPpt = 0, numMsg = 0, numWordX = 0;
|
||||
|
@ -463,16 +455,17 @@ public class TestExtractorFactory {
|
|||
* #59074 - Excel 95 files should give a helpful message, not just
|
||||
* "No supported documents found in the OLE2 stream"
|
||||
*/
|
||||
@Test(expected = OldExcelFormatException.class)
|
||||
public void bug59074() throws Exception {
|
||||
ExtractorFactory.createExtractor(
|
||||
POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"));
|
||||
try (POITextExtractor extractor = ExtractorFactory.createExtractor(POIDataSamples.getSpreadSheetInstance().getFile("59074.xls"))) {
|
||||
String text = extractor.getText();
|
||||
assertContains(text, "testdoc");
|
||||
}
|
||||
}
|
||||
|
||||
@Test(expected = IllegalStateException.class)
|
||||
public void testGetEmbeddedFromXMLExtractor() {
|
||||
public void testGetEmbeddedFromXMLExtractor() throws IOException {
|
||||
// currently not implemented
|
||||
ExtractorFactory.getEmbeddedDocsTextExtractors((POIXMLTextExtractor)null);
|
||||
ExtractorFactory.getEmbeddedDocsTextExtractors(null);
|
||||
}
|
||||
|
||||
// This bug is currently open. This test will fail with "expected error not thrown" when the bug has been fixed.
|
||||
|
|
|
@ -60,9 +60,9 @@ import org.apache.poi.EncryptedDocumentException;
|
|||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.POITestCase;
|
||||
import org.apache.poi.UnsupportedFileFormatException;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.ooxml.POIXMLException;
|
||||
import org.apache.poi.ooxml.extractor.ExtractorFactory;
|
||||
import org.apache.poi.ooxml.util.DocumentHelper;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||
import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
|
||||
|
@ -836,7 +836,7 @@ public final class TestPackage {
|
|||
|
||||
@Test
|
||||
public void testZipEntityExpansionExceedsMemory() throws IOException, OpenXML4JException, XmlException {
|
||||
expectedEx.expect(POIXMLException.class);
|
||||
expectedEx.expect(IOException.class);
|
||||
expectedEx.expectMessage("unable to parse shared strings table");
|
||||
expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than"));
|
||||
openXmlBombFile("poc-xmlbomb.xlsx");
|
||||
|
@ -844,7 +844,7 @@ public final class TestPackage {
|
|||
|
||||
@Test
|
||||
public void testZipEntityExpansionExceedsMemory2() throws IOException, OpenXML4JException, XmlException {
|
||||
expectedEx.expect(POIXMLException.class);
|
||||
expectedEx.expect(IOException.class);
|
||||
expectedEx.expectMessage("unable to parse shared strings table");
|
||||
expectedEx.expectCause(getCauseMatcher(SAXParseException.class, "The parser has encountered more than"));
|
||||
openXmlBombFile("poc-xmlbomb-empty.xlsx");
|
||||
|
|
|
@ -35,14 +35,12 @@ import java.util.Collection;
|
|||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
|
||||
import org.apache.poi.ooxml.extractor.ExtractorFactory;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.poifs.crypt.EncryptionInfo;
|
||||
import org.apache.poi.poifs.crypt.cryptoapi.CryptoAPIEncryptionHeader;
|
||||
import org.apache.poi.poifs.storage.RawDataUtil;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.Parameterized;
|
||||
|
@ -91,7 +89,7 @@ public class TestHxxFEncryption {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void extract() throws IOException, OpenXML4JException, XmlException {
|
||||
public void extract() throws IOException {
|
||||
File f = sampleDir.getFile(file);
|
||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||
try (POITextExtractor te = ExtractorFactory.createExtractor(f)) {
|
||||
|
@ -103,16 +101,16 @@ public class TestHxxFEncryption {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void changePassword() throws IOException, OpenXML4JException, XmlException {
|
||||
public void changePassword() throws IOException {
|
||||
newPassword("test");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void removePassword() throws IOException, OpenXML4JException, XmlException {
|
||||
public void removePassword() throws IOException {
|
||||
newPassword(null);
|
||||
}
|
||||
|
||||
private void newPassword(String newPass) throws IOException, OpenXML4JException, XmlException {
|
||||
private void newPassword(String newPass) throws IOException {
|
||||
File f = sampleDir.getFile(file);
|
||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||
try (POITextExtractor te1 = ExtractorFactory.createExtractor(f)) {
|
||||
|
@ -133,7 +131,7 @@ public class TestHxxFEncryption {
|
|||
|
||||
/** changing the encryption mode and key size in poor mans style - see comments below */
|
||||
@Test
|
||||
public void changeEncryption() throws IOException, OpenXML4JException, XmlException {
|
||||
public void changeEncryption() throws IOException {
|
||||
File f = sampleDir.getFile(file);
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||
|
@ -157,7 +155,7 @@ public class TestHxxFEncryption {
|
|||
POIDocument doc = (POIDocument) te3.getDocument()) {
|
||||
// need to cache data (i.e. read all data) before changing the key size
|
||||
Class<?> clazz = doc.getClass();
|
||||
if ("HSLFSlideShowImpl".equals(clazz.getSimpleName())) {
|
||||
if ("HSLFSlideShow".equals(clazz.getSimpleName())) {
|
||||
try {
|
||||
clazz.getDeclaredMethod("getPictureData").invoke(doc);
|
||||
} catch (ReflectiveOperationException e) {
|
||||
|
|
|
@ -522,7 +522,7 @@ public class TestXSLFBugs {
|
|||
private String getSlideText(XMLSlideShow ppt, XSLFSlide slide) throws IOException {
|
||||
try (SlideShowExtractor<XSLFShape,XSLFTextParagraph> extr = new SlideShowExtractor<>(ppt)) {
|
||||
// do not auto-close the slideshow
|
||||
extr.setFilesystem(null);
|
||||
extr.setCloseFilesystem(false);
|
||||
extr.setSlidesByDefault(true);
|
||||
extr.setNotesByDefault(false);
|
||||
extr.setMasterByDefault(false);
|
||||
|
|
|
@ -29,20 +29,18 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.ooxml.extractor.ExtractorFactory;
|
||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||
import org.apache.poi.xslf.usermodel.XSLFShape;
|
||||
import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
|
||||
import org.apache.xmlbeans.XmlException;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Tests for XSLFPowerPointExtractor
|
||||
*/
|
||||
public class TestXSLFPowerPointExtractor {
|
||||
private static POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||
private static final POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||
|
||||
/**
|
||||
* Get text out of the simple file
|
||||
|
@ -262,10 +260,11 @@ public class TestXSLFPowerPointExtractor {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void test45541() throws IOException, OpenXML4JException, XmlException {
|
||||
public void test45541() throws IOException {
|
||||
// extract text from a powerpoint that has a header in the notes-element
|
||||
final File headerFile = slTests.getFile("45541_Header.pptx");
|
||||
try (final SlideShowExtractor extr = ExtractorFactory.createExtractor(headerFile)) {
|
||||
//noinspection rawtypes
|
||||
try (final SlideShowExtractor extr = (SlideShowExtractor) ExtractorFactory.createExtractor(headerFile)) {
|
||||
String text = extr.getText();
|
||||
assertNotNull(text);
|
||||
assertFalse("Had: " + text, text.contains("testdoc"));
|
||||
|
@ -280,7 +279,8 @@ public class TestXSLFPowerPointExtractor {
|
|||
|
||||
// extract text from a powerpoint that has a footer in the master-slide
|
||||
final File footerFile = slTests.getFile("45541_Footer.pptx");
|
||||
try (SlideShowExtractor extr = ExtractorFactory.createExtractor(footerFile)) {
|
||||
//noinspection rawtypes
|
||||
try (SlideShowExtractor extr = (SlideShowExtractor)ExtractorFactory.createExtractor(footerFile)) {
|
||||
String text = extr.getText();
|
||||
assertNotContained(text, "testdoc");
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
==================================================================== */
|
||||
package org.apache.poi.xssf.extractor;
|
||||
|
||||
import org.apache.poi.ooxml.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
import org.junit.After;
|
||||
|
||||
|
|
|
@ -17,8 +17,8 @@
|
|||
|
||||
package org.apache.poi.xssf.extractor;
|
||||
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
import org.apache.poi.ooxml.extractor.ExtractorFactory;
|
||||
import org.junit.After;
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
# ====================================================================
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ====================================================================
|
||||
|
||||
org.apache.poi.extractor.MainExtractorFactory
|
|
@ -0,0 +1,18 @@
|
|||
# ====================================================================
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ====================================================================
|
||||
|
||||
org.apache.poi.ooxml.extractor.POIXMLExtractorFactory
|
|
@ -0,0 +1,18 @@
|
|||
# ====================================================================
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ====================================================================
|
||||
|
||||
org.apache.poi.extractor.ole2.OLE2ScratchpadExtractorFactory
|
|
@ -17,44 +17,66 @@
|
|||
package org.apache.poi.extractor.ole2;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.ExtractorProvider;
|
||||
import org.apache.poi.extractor.POIOLE2TextExtractor;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.extractor.OLE2ExtractorFactory;
|
||||
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
|
||||
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
|
||||
import org.apache.poi.hslf.usermodel.HSLFShape;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
||||
import org.apache.poi.hsmf.extractor.OutlookTextExtractor;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
|
||||
import org.apache.poi.hwpf.OldWordFileFormatException;
|
||||
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryEntry;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Entry;
|
||||
import org.apache.poi.poifs.filesystem.FileMagic;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* Scratchpad-specific logic for {@link OLE2ExtractorFactory} and
|
||||
* Scratchpad-specific logic for {@link ExtractorFactory} and
|
||||
* {@link org.apache.poi.extractor.ExtractorFactory}, which permit the other two to run with
|
||||
* no Scratchpad jar (though without functionality!)
|
||||
* <p>Note - should not be used standalone, always use via the other
|
||||
* two classes</p>
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public class OLE2ScratchpadExtractorFactory {
|
||||
public class OLE2ScratchpadExtractorFactory implements ExtractorProvider {
|
||||
private static final POILogger logger = POILogFactory.getLogger(OLE2ScratchpadExtractorFactory.class);
|
||||
|
||||
@Override
|
||||
public boolean accepts(FileMagic fm) {
|
||||
return FileMagic.OLE2 == fm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public POITextExtractor create(File file, String password) throws IOException {
|
||||
return create(new POIFSFileSystem(file, true).getRoot(), password);
|
||||
}
|
||||
|
||||
@Override
|
||||
public POITextExtractor create(InputStream inputStream, String password) throws IOException {
|
||||
return create(new POIFSFileSystem(inputStream).getRoot(), password);
|
||||
}
|
||||
|
||||
/**
|
||||
* Look for certain entries in the stream, to figure
|
||||
* out what format is desired
|
||||
|
@ -66,48 +88,54 @@ public class OLE2ScratchpadExtractorFactory {
|
|||
*
|
||||
* @throws IOException when the format specific extraction fails because of invalid entries
|
||||
*/
|
||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
|
||||
if (poifsDir.hasEntry("WordDocument")) {
|
||||
// Old or new style word document?
|
||||
try {
|
||||
return new WordExtractor(poifsDir);
|
||||
} catch (OldWordFileFormatException e) {
|
||||
return new Word6Extractor(poifsDir);
|
||||
public POITextExtractor create(DirectoryNode poifsDir, String password) throws IOException {
|
||||
final String oldPW = Biff8EncryptionKey.getCurrentUserPassword();
|
||||
try {
|
||||
Biff8EncryptionKey.setCurrentUserPassword(password);
|
||||
if (poifsDir.hasEntry("WordDocument")) {
|
||||
// Old or new style word document?
|
||||
try {
|
||||
return new WordExtractor(poifsDir);
|
||||
} catch (OldWordFileFormatException e) {
|
||||
return new Word6Extractor(poifsDir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
|
||||
return new SlideShowExtractor(SlideShowFactory.create(poifsDir));
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("VisioDocument")) {
|
||||
return new VisioTextExtractor(poifsDir);
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("Quill")) {
|
||||
return new PublisherTextExtractor(poifsDir);
|
||||
}
|
||||
|
||||
final String[] outlookEntryNames = new String[] {
|
||||
// message bodies, saved as plain text (PtypString)
|
||||
// The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
|
||||
// the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
|
||||
// https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx
|
||||
// @see org.apache.poi.hsmf.Types.MAPIType
|
||||
"__substg1.0_1000001E", //PidTagBody ASCII
|
||||
"__substg1.0_1000001F", //PidTagBody Unicode
|
||||
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
|
||||
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
|
||||
"__substg1.0_0037001E", //PidTagSubject ASCII
|
||||
"__substg1.0_0037001F", //PidTagSubject Unicode
|
||||
};
|
||||
for (String entryName : outlookEntryNames) {
|
||||
if (poifsDir.hasEntry(entryName)) {
|
||||
return new OutlookTextExtractor(poifsDir);
|
||||
if (poifsDir.hasEntry(HSLFSlideShow.POWERPOINT_DOCUMENT)) {
|
||||
return new SlideShowExtractor<HSLFShape, HSLFTextParagraph>(SlideShowFactory.create(poifsDir));
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("VisioDocument")) {
|
||||
return new VisioTextExtractor(poifsDir);
|
||||
}
|
||||
|
||||
if (poifsDir.hasEntry("Quill")) {
|
||||
return new PublisherTextExtractor(poifsDir);
|
||||
}
|
||||
|
||||
final String[] outlookEntryNames = new String[]{
|
||||
// message bodies, saved as plain text (PtypString)
|
||||
// The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
|
||||
// the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
|
||||
// https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx
|
||||
// @see org.apache.poi.hsmf.Types.MAPIType
|
||||
"__substg1.0_1000001E", //PidTagBody ASCII
|
||||
"__substg1.0_1000001F", //PidTagBody Unicode
|
||||
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
|
||||
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
|
||||
"__substg1.0_0037001E", //PidTagSubject ASCII
|
||||
"__substg1.0_0037001F", //PidTagSubject Unicode
|
||||
};
|
||||
for (String entryName : outlookEntryNames) {
|
||||
if (poifsDir.hasEntry(entryName)) {
|
||||
return new OutlookTextExtractor(poifsDir);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
Biff8EncryptionKey.setCurrentUserPassword(oldPW);
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -120,10 +148,9 @@ public class OLE2ScratchpadExtractorFactory {
|
|||
* @param ext the extractor holding the directory to start parsing
|
||||
* @param dirs a list to be filled with directory references holding embedded
|
||||
* @param nonPOIFS a list to be filled with streams which aren't based on POIFS entries
|
||||
*
|
||||
* @throws IOException when the format specific extraction fails because of invalid entries
|
||||
*/
|
||||
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
|
||||
@Override
|
||||
public void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) {
|
||||
// Find all the embedded directories
|
||||
DirectoryEntry root = ext.getRoot();
|
||||
if (root == null) {
|
||||
|
@ -132,25 +159,16 @@ public class OLE2ScratchpadExtractorFactory {
|
|||
|
||||
if (ext instanceof ExcelExtractor) {
|
||||
// These are in MBD... under the root
|
||||
Iterator<Entry> it = root.getEntries();
|
||||
while (it.hasNext()) {
|
||||
Entry entry = it.next();
|
||||
if (entry.getName().startsWith("MBD")) {
|
||||
dirs.add(entry);
|
||||
}
|
||||
}
|
||||
StreamSupport.stream(root.spliterator(), false)
|
||||
.filter(entry -> entry.getName().startsWith("MBD"))
|
||||
.forEach(dirs::add);
|
||||
} else if (ext instanceof WordExtractor) {
|
||||
// These are in ObjectPool -> _... under the root
|
||||
try {
|
||||
DirectoryEntry op = (DirectoryEntry)
|
||||
root.getEntry("ObjectPool");
|
||||
Iterator<Entry> it = op.getEntries();
|
||||
while(it.hasNext()) {
|
||||
Entry entry = it.next();
|
||||
if(entry.getName().startsWith("_")) {
|
||||
dirs.add(entry);
|
||||
}
|
||||
}
|
||||
DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
|
||||
StreamSupport.stream(op.spliterator(), false)
|
||||
.filter(entry -> entry.getName().startsWith("_"))
|
||||
.forEach(dirs::add);
|
||||
} catch(FileNotFoundException e) {
|
||||
logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage());
|
||||
// ignored here
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
package org.apache.poi.hdgf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
|
@ -38,11 +37,11 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
* Can operate on the command line (outputs to stdout), or
|
||||
* can return the text for you (example: for use with Lucene).
|
||||
*/
|
||||
public final class VisioTextExtractor extends POIOLE2TextExtractor {
|
||||
public final class VisioTextExtractor implements POIOLE2TextExtractor {
|
||||
private HDGFDiagram hdgf;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
public VisioTextExtractor(HDGFDiagram hdgf) {
|
||||
super(hdgf);
|
||||
this.hdgf = hdgf;
|
||||
}
|
||||
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
|
||||
|
@ -91,9 +90,7 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor {
|
|||
// Capture the text, as long as it isn't
|
||||
// simply an empty string
|
||||
String str = cmd.getValue().toString();
|
||||
if(str.isEmpty() || "\n".equals(str)) {
|
||||
// Ignore empty strings
|
||||
} else {
|
||||
if (!(str.isEmpty() || "\n".equals(str))) {
|
||||
text.add( str );
|
||||
}
|
||||
}
|
||||
|
@ -121,21 +118,23 @@ public final class VisioTextExtractor extends POIOLE2TextExtractor {
|
|||
return text.toString();
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length == 0) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" VisioTextExtractor <file.vsd>");
|
||||
System.exit(1);
|
||||
}
|
||||
@Override
|
||||
public HDGFDiagram getDocument() {
|
||||
return hdgf;
|
||||
}
|
||||
|
||||
try (FileInputStream fis = new FileInputStream(args[0])) {
|
||||
VisioTextExtractor extractor =
|
||||
new VisioTextExtractor(fis);
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
// Print not PrintLn as already has \n added to it
|
||||
System.out.print(extractor.getText());
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
extractor.close();
|
||||
}
|
||||
@Override
|
||||
public HDGFDiagram getFilesystem() {
|
||||
return hdgf;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,35 +17,37 @@
|
|||
|
||||
package org.apache.poi.hpbf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.extractor.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hpbf.HPBFDocument;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCBit;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCPLCBit.Type12;
|
||||
import org.apache.poi.hpbf.model.qcbits.QCTextBit;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Extract text from HPBF Publisher files
|
||||
*/
|
||||
public final class PublisherTextExtractor extends POIOLE2TextExtractor {
|
||||
private HPBFDocument doc;
|
||||
public final class PublisherTextExtractor implements POIOLE2TextExtractor {
|
||||
private final HPBFDocument doc;
|
||||
private boolean hyperlinksByDefault;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
public PublisherTextExtractor(HPBFDocument doc) {
|
||||
super(doc);
|
||||
this.doc = doc;
|
||||
}
|
||||
|
||||
public PublisherTextExtractor(DirectoryNode dir) throws IOException {
|
||||
this(new HPBFDocument(dir));
|
||||
}
|
||||
|
||||
public PublisherTextExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(new HPBFDocument(fs));
|
||||
}
|
||||
|
||||
public PublisherTextExtractor(InputStream is) throws IOException {
|
||||
this(new POIFSFileSystem(is));
|
||||
}
|
||||
|
@ -66,7 +68,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
|
|||
// Get the text from the Quill Contents
|
||||
QCBit[] bits = doc.getQuillContents().getBits();
|
||||
for (QCBit bit1 : bits) {
|
||||
if (bit1 != null && bit1 instanceof QCTextBit) {
|
||||
if (bit1 instanceof QCTextBit) {
|
||||
QCTextBit t = (QCTextBit) bit1;
|
||||
text.append(t.getText().replace('\r', '\n'));
|
||||
}
|
||||
|
@ -79,7 +81,7 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
|
|||
// how to tie that together.
|
||||
if(hyperlinksByDefault) {
|
||||
for (QCBit bit : bits) {
|
||||
if (bit != null && bit instanceof Type12) {
|
||||
if (bit instanceof Type12) {
|
||||
Type12 hyperlinks = (Type12) bit;
|
||||
for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) {
|
||||
text.append("<");
|
||||
|
@ -96,19 +98,23 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
|
|||
return text.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public HPBFDocument getDocument() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length == 0) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" PublisherTextExtractor <file.pub>");
|
||||
}
|
||||
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
for (String arg : args) {
|
||||
try (FileInputStream fis = new FileInputStream(arg)) {
|
||||
PublisherTextExtractor te = new PublisherTextExtractor(fis);
|
||||
System.out.println(te.getText());
|
||||
te.close();
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HPBFDocument getFilesystem() {
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,279 +0,0 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hslf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.EncryptedDocumentException;
|
||||
import org.apache.poi.extractor.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hslf.usermodel.HSLFObjectShape;
|
||||
import org.apache.poi.hslf.usermodel.HSLFShape;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
|
||||
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
|
||||
import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.sl.extractor.SlideShowExtractor;
|
||||
import org.apache.poi.sl.usermodel.SlideShow;
|
||||
import org.apache.poi.sl.usermodel.SlideShowFactory;
|
||||
import org.apache.poi.util.Removal;
|
||||
|
||||
/**
|
||||
* This class can be used to extract text from a PowerPoint file. Can optionally
|
||||
* also get the notes from one.
|
||||
*
|
||||
* @deprecated in POI 4.0.0, use {@link SlideShowExtractor} instead
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
@Deprecated
|
||||
@Removal(version="5.0.0")
|
||||
public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
||||
private final SlideShowExtractor<HSLFShape,HSLFTextParagraph> delegate;
|
||||
|
||||
private boolean slidesByDefault = true;
|
||||
private boolean notesByDefault;
|
||||
private boolean commentsByDefault;
|
||||
private boolean masterByDefault;
|
||||
|
||||
/**
|
||||
* Basic extractor. Returns all the text, and optionally all the notes
|
||||
*/
|
||||
public static void main(String[] args) throws IOException {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Usage:");
|
||||
System.err.println("\tPowerPointExtractor [-notes] <file>");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
boolean notes = false;
|
||||
boolean comments = false;
|
||||
boolean master = true;
|
||||
|
||||
String file;
|
||||
if (args.length > 1) {
|
||||
notes = true;
|
||||
file = args[1];
|
||||
if (args.length > 2) {
|
||||
comments = true;
|
||||
}
|
||||
} else {
|
||||
file = args[0];
|
||||
}
|
||||
|
||||
try (PowerPointExtractor ppe = new PowerPointExtractor(file)) {
|
||||
System.out.println(ppe.getText(true, notes, comments, master));
|
||||
}
|
||||
}
|
||||
|
||||
public PowerPointExtractor(final HSLFSlideShow slideShow) {
|
||||
super(slideShow.getSlideShowImpl());
|
||||
setFilesystem(slideShow);
|
||||
delegate = new SlideShowExtractor<>(slideShow);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from a file
|
||||
*
|
||||
* @param fileName The name of the file to extract from
|
||||
*/
|
||||
public PowerPointExtractor(String fileName) throws IOException {
|
||||
this(createHSLF(new File(fileName), Biff8EncryptionKey.getCurrentUserPassword(), true));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from an Input Stream
|
||||
*
|
||||
* @param iStream The input stream containing the PowerPoint document
|
||||
*/
|
||||
public PowerPointExtractor(InputStream iStream) throws IOException {
|
||||
this(createHSLF(iStream, Biff8EncryptionKey.getCurrentUserPassword()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from an open POIFSFileSystem
|
||||
*
|
||||
* @param fs the POIFSFileSystem containing the PowerPoint document
|
||||
*/
|
||||
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(createHSLF(fs, Biff8EncryptionKey.getCurrentUserPassword()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from a specific place
|
||||
* inside an open {@link POIFSFileSystem}
|
||||
*
|
||||
* @param dir the POIFS Directory containing the PowerPoint document
|
||||
*/
|
||||
public PowerPointExtractor(DirectoryNode dir) throws IOException {
|
||||
this(new HSLFSlideShow(dir));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from a HSLFSlideShow
|
||||
*
|
||||
* @param ss the HSLFSlideShow to extract text from
|
||||
*/
|
||||
public PowerPointExtractor(HSLFSlideShowImpl ss) {
|
||||
this(new HSLFSlideShow(ss));
|
||||
}
|
||||
|
||||
/**
|
||||
* Should a call to getText() return slide text? Default is yes
|
||||
*/
|
||||
public void setSlidesByDefault(final boolean slidesByDefault) {
|
||||
this.slidesByDefault = slidesByDefault;
|
||||
delegate.setSlidesByDefault(slidesByDefault);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should a call to getText() return notes text? Default is no
|
||||
*/
|
||||
public void setNotesByDefault(final boolean notesByDefault) {
|
||||
this.notesByDefault = notesByDefault;
|
||||
delegate.setNotesByDefault(notesByDefault);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should a call to getText() return comments text? Default is no
|
||||
*/
|
||||
public void setCommentsByDefault(final boolean commentsByDefault) {
|
||||
this.commentsByDefault = commentsByDefault;
|
||||
delegate.setCommentsByDefault(commentsByDefault);
|
||||
}
|
||||
|
||||
/**
|
||||
* Should a call to getText() return text from master? Default is no
|
||||
*/
|
||||
public void setMasterByDefault(final boolean masterByDefault) {
|
||||
this.masterByDefault = masterByDefault;
|
||||
delegate.setMasterByDefault(masterByDefault);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches all the slide text from the slideshow, but not the notes, unless
|
||||
* you've called setSlidesByDefault() and setNotesByDefault() to change this
|
||||
*/
|
||||
@Override
|
||||
public String getText() {
|
||||
return delegate.getText();
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches text from the slideshow, be it slide text or note text. Because
|
||||
* the final block of text in a TextRun normally has its last \n
|
||||
* stripped, we add it back
|
||||
*
|
||||
* @param getSlideText fetch slide text
|
||||
* @param getNoteText fetch note text
|
||||
*/
|
||||
public String getText(boolean getSlideText, boolean getNoteText) {
|
||||
return getText(getSlideText,getNoteText,commentsByDefault,masterByDefault);
|
||||
}
|
||||
|
||||
public String getText(boolean getSlideText, boolean getNoteText, boolean getCommentText, boolean getMasterText) {
|
||||
delegate.setSlidesByDefault(getSlideText);
|
||||
delegate.setNotesByDefault(getNoteText);
|
||||
delegate.setCommentsByDefault(getCommentText);
|
||||
delegate.setMasterByDefault(getMasterText);
|
||||
try {
|
||||
return delegate.getText();
|
||||
} finally {
|
||||
delegate.setSlidesByDefault(slidesByDefault);
|
||||
delegate.setNotesByDefault(notesByDefault);
|
||||
delegate.setCommentsByDefault(commentsByDefault);
|
||||
delegate.setMasterByDefault(masterByDefault);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches all the notes text from the slideshow, but not the slide text
|
||||
*/
|
||||
public String getNotes() {
|
||||
return getText(false, true, false, false);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public List<HSLFObjectShape> getOLEShapes() {
|
||||
return (List<HSLFObjectShape>)delegate.getOLEShapes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to avoid problems with compiling code in Eclipse
|
||||
*
|
||||
* Eclipse javac has some bugs with complex casts, this method tries
|
||||
* to work around this.
|
||||
*
|
||||
* @param fs The {@link POIFSFileSystem} to read the document from
|
||||
* @param password The password that should be used or null if no password is necessary.
|
||||
*
|
||||
* @return The created SlideShow
|
||||
*
|
||||
* @throws IOException if an error occurs while reading the data
|
||||
*/
|
||||
private static HSLFSlideShow createHSLF(POIFSFileSystem fs, String password) throws IOException, EncryptedDocumentException {
|
||||
// Note: don't change the code here, it is required for Eclipse to compile the code
|
||||
SlideShow slideShowOrig = SlideShowFactory.create(fs, password);
|
||||
return (HSLFSlideShow)slideShowOrig;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to avoid problems with compiling code in Eclipse
|
||||
*
|
||||
* Eclipse javac has some bugs with complex casts, this method tries
|
||||
* to work around this.
|
||||
*
|
||||
* @param inp The {@link InputStream} to read data from.
|
||||
* @param password The password that should be used or null if no password is necessary.
|
||||
*
|
||||
* @return The created SlideShow
|
||||
*
|
||||
* @throws IOException if an error occurs while reading the data
|
||||
* @throws EncryptedDocumentException If the wrong password is given for a protected file
|
||||
*/
|
||||
private static HSLFSlideShow createHSLF(InputStream inp, String password) throws IOException, EncryptedDocumentException {
|
||||
// Note: don't change the code here, it is required for Eclipse to compile the code
|
||||
SlideShow slideShowOrig = SlideShowFactory.create(inp, password);
|
||||
return (HSLFSlideShow)slideShowOrig;
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to avoid problems with compiling code in Eclipse
|
||||
*
|
||||
* Eclipse javac has some bugs with complex casts, this method tries
|
||||
* to work around this.
|
||||
*
|
||||
* @param file The file to read data from.
|
||||
* @param password The password that should be used or null if no password is necessary.
|
||||
* @param readOnly If the SlideShow should be opened in read-only mode to avoid writing back
|
||||
* changes when the document is closed.
|
||||
*
|
||||
* @return The created SlideShow
|
||||
*
|
||||
* @throws IOException if an error occurs while reading the data
|
||||
* @throws EncryptedDocumentException If the wrong password is given for a protected file
|
||||
*/
|
||||
private static HSLFSlideShow createHSLF(File file, String password, boolean readOnly) throws IOException, EncryptedDocumentException {
|
||||
// Note: don't change the code here, it is required for Eclipse to compile the code
|
||||
SlideShow slideShowOrig = SlideShowFactory.create(file, password, readOnly);
|
||||
return (HSLFSlideShow)slideShowOrig;
|
||||
}
|
||||
}
|
|
@ -33,6 +33,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.common.usermodel.GenericRecord;
|
||||
import org.apache.poi.common.usermodel.fonts.FontInfo;
|
||||
import org.apache.poi.ddf.EscherBSERecord;
|
||||
|
@ -40,6 +41,9 @@ import org.apache.poi.ddf.EscherContainerRecord;
|
|||
import org.apache.poi.ddf.EscherOptRecord;
|
||||
import org.apache.poi.hpsf.ClassID;
|
||||
import org.apache.poi.hpsf.ClassIDPredefined;
|
||||
import org.apache.poi.hpsf.DocumentSummaryInformation;
|
||||
import org.apache.poi.hpsf.PropertySet;
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
|
||||
import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException;
|
||||
import org.apache.poi.hslf.exceptions.HSLFException;
|
||||
|
@ -47,6 +51,7 @@ import org.apache.poi.hslf.model.HeadersFooters;
|
|||
import org.apache.poi.hslf.model.MovieShape;
|
||||
import org.apache.poi.hslf.record.*;
|
||||
import org.apache.poi.hslf.record.SlideListWithText.SlideAtomsSet;
|
||||
import org.apache.poi.poifs.crypt.EncryptionInfo;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.Ole10Native;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
@ -66,7 +71,7 @@ import org.apache.poi.util.Units;
|
|||
* TODO: - figure out how to match notes to their correct sheet (will involve
|
||||
* understanding DocSlideList and DocNotesList) - handle Slide creation cleaner
|
||||
*/
|
||||
public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord {
|
||||
public final class HSLFSlideShow extends POIDocument implements SlideShow<HSLFShape,HSLFTextParagraph>, Closeable, GenericRecord {
|
||||
|
||||
//arbitrarily selected; may need to increase
|
||||
private static final int MAX_RECORD_LENGTH = 10_000_000;
|
||||
|
@ -111,6 +116,8 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
|
|||
* @param hslfSlideShow the HSLFSlideShow to base on
|
||||
*/
|
||||
public HSLFSlideShow(HSLFSlideShowImpl hslfSlideShow) {
|
||||
super(hslfSlideShow.getDirectory());
|
||||
|
||||
loadSavePhase.set(LoadSavePhase.INIT);
|
||||
|
||||
// Get useful things from our base slideshow
|
||||
|
@ -1179,4 +1186,94 @@ public final class HSLFSlideShow implements SlideShow<HSLFShape,HSLFTextParagrap
|
|||
public List<? extends GenericRecord> getGenericChildren() {
|
||||
return Arrays.asList(_hslfSlideShow.getRecords());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write() throws IOException {
|
||||
getSlideShowImpl().write();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(File newFile) throws IOException {
|
||||
getSlideShowImpl().write(newFile);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocumentSummaryInformation getDocumentSummaryInformation() {
|
||||
return getSlideShowImpl().getDocumentSummaryInformation();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SummaryInformation getSummaryInformation() {
|
||||
return getSlideShowImpl().getSummaryInformation();
|
||||
}
|
||||
|
||||
/**
 * Forwards to {@code getSlideShowImpl().createInformationProperties()} so
 * the properties are created on the underlying implementation.
 */
@Override
|
||||
public void createInformationProperties() {
|
||||
getSlideShowImpl().createInformationProperties();
|
||||
}
|
||||
|
||||
/**
 * Forwards to {@code getSlideShowImpl().readProperties()} so the
 * properties are read by the underlying implementation.
 */
@Override
|
||||
public void readProperties() {
|
||||
getSlideShowImpl().readProperties();
|
||||
}
|
||||
|
||||
/**
 * Looks up a property set through the underlying implementation's
 * {@code getPropertySetImpl(String)} bridge method.
 *
 * @param setName passed unchanged to the underlying implementation
 * @return the value returned by the bridge method
 * @throws IOException declared for the forwarded call
 */
@Override
|
||||
protected PropertySet getPropertySet(String setName) throws IOException {
|
||||
return getSlideShowImpl().getPropertySetImpl(setName);
|
||||
}
|
||||
|
||||
/**
 * Looks up a property set through the underlying implementation's
 * {@code getPropertySetImpl(String, EncryptionInfo)} bridge method.
 *
 * @param setName passed unchanged to the underlying implementation
 * @param encryptionInfo passed unchanged to the underlying implementation
 * @return the value returned by the bridge method
 * @throws IOException declared for the forwarded call
 */
@Override
|
||||
protected PropertySet getPropertySet(String setName, EncryptionInfo encryptionInfo) throws IOException {
|
||||
return getSlideShowImpl().getPropertySetImpl(setName, encryptionInfo);
|
||||
}
|
||||
|
||||
/**
 * Writes the properties through the underlying implementation's
 * {@code writePropertiesImpl()} bridge method.
 *
 * @throws IOException declared for the forwarded call
 */
@Override
|
||||
protected void writeProperties() throws IOException {
|
||||
getSlideShowImpl().writePropertiesImpl();
|
||||
}
|
||||
|
||||
/**
 * Writes the properties to the given filesystem by forwarding to
 * {@code getSlideShowImpl().writeProperties(outFS)}.
 *
 * @param outFS passed unchanged to the underlying implementation
 * @throws IOException declared for the forwarded call
 */
@Override
|
||||
public void writeProperties(POIFSFileSystem outFS) throws IOException {
|
||||
getSlideShowImpl().writeProperties(outFS);
|
||||
}
|
||||
|
||||
/**
 * Writes the properties through the underlying implementation's
 * {@code writePropertiesImpl(POIFSFileSystem, List)} bridge method.
 *
 * @param outFS passed unchanged to the underlying implementation
 * @param writtenEntries passed unchanged to the underlying implementation
 * @throws IOException declared for the forwarded call
 */
@Override
|
||||
protected void writeProperties(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException {
|
||||
getSlideShowImpl().writePropertiesImpl(outFS, writtenEntries);
|
||||
}
|
||||
|
||||
/**
 * Runs the in-place-write validation of the underlying implementation via
 * its {@code validateInPlaceWritePossibleImpl()} bridge method.
 *
 * @throws IllegalStateException declared for the forwarded call
 */
@Override
|
||||
protected void validateInPlaceWritePossible() throws IllegalStateException {
|
||||
getSlideShowImpl().validateInPlaceWritePossibleImpl();
|
||||
}
|
||||
|
||||
/**
 * Returns the directory reported by {@code getSlideShowImpl().getDirectory()}.
 *
 * @return the directory node of the underlying implementation
 */
@Override
|
||||
public DirectoryNode getDirectory() {
|
||||
return getSlideShowImpl().getDirectory();
|
||||
}
|
||||
|
||||
/**
 * Clears the directory of the underlying implementation via its
 * {@code clearDirectoryImpl()} bridge method.
 */
@Override
|
||||
protected void clearDirectory() {
|
||||
getSlideShowImpl().clearDirectoryImpl();
|
||||
}
|
||||
|
||||
/**
 * Initialises the directory of the underlying implementation via its
 * {@code initDirectoryImpl()} bridge method.
 *
 * @return the boolean result of the bridge method, unchanged
 */
@Override
|
||||
protected boolean initDirectory() {
|
||||
return getSlideShowImpl().initDirectoryImpl();
|
||||
}
|
||||
|
||||
/**
 * Replaces the directory of the underlying implementation via its
 * {@code replaceDirectoryImpl(DirectoryNode)} bridge method.
 *
 * @param newDirectory passed unchanged to the underlying implementation
 */
@Override
|
||||
protected void replaceDirectory(DirectoryNode newDirectory) {
|
||||
getSlideShowImpl().replaceDirectoryImpl(newDirectory);
|
||||
}
|
||||
|
||||
/**
 * Returns the encrypted property stream name reported by the underlying
 * implementation.
 *
 * @return the result of {@code getSlideShowImpl().getEncryptedPropertyStreamName()}
 */
@Override
|
||||
protected String getEncryptedPropertyStreamName() {
|
||||
return getSlideShowImpl().getEncryptedPropertyStreamName();
|
||||
}
|
||||
|
||||
/**
 * Returns the encryption info reported by the underlying implementation.
 *
 * @return the result of {@code getSlideShowImpl().getEncryptionInfo()}
 * @throws IOException declared for the forwarded call
 */
@Override
|
||||
public EncryptionInfo getEncryptionInfo() throws IOException {
|
||||
return getSlideShowImpl().getEncryptionInfo();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -36,6 +36,7 @@ import java.util.NavigableMap;
|
|||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.hpsf.PropertySet;
|
||||
import org.apache.poi.hslf.exceptions.CorruptPowerPointFileException;
|
||||
import org.apache.poi.hslf.exceptions.HSLFException;
|
||||
import org.apache.poi.hslf.exceptions.OldPowerPointFormatException;
|
||||
|
@ -714,8 +715,6 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/* ******************* adding methods follow ********************* */
|
||||
|
||||
/**
|
||||
|
@ -850,6 +849,38 @@ public final class HSLFSlideShowImpl extends POIDocument implements Closeable {
|
|||
return "EncryptedSummary";
|
||||
}
|
||||
|
||||
/**
 * Package-private bridge that invokes the superclass
 * {@code writeProperties()} on behalf of callers in this package.
 *
 * @throws IOException declared for the superclass call
 */
void writePropertiesImpl() throws IOException {
|
||||
super.writeProperties();
|
||||
}
|
||||
|
||||
/**
 * Package-private bridge that invokes the superclass
 * {@code getPropertySet(String)} on behalf of callers in this package.
 *
 * @param setName passed unchanged to the superclass method
 * @return the value returned by the superclass method
 * @throws IOException declared for the superclass call
 */
PropertySet getPropertySetImpl(String setName) throws IOException {
|
||||
return super.getPropertySet(setName);
|
||||
}
|
||||
|
||||
/**
 * Package-private bridge that invokes the superclass
 * {@code getPropertySet(String, EncryptionInfo)} on behalf of callers in
 * this package.
 *
 * @param setName passed unchanged to the superclass method
 * @param encryptionInfo passed unchanged to the superclass method
 * @return the value returned by the superclass method
 * @throws IOException declared for the superclass call
 */
PropertySet getPropertySetImpl(String setName, EncryptionInfo encryptionInfo) throws IOException {
|
||||
return super.getPropertySet(setName, encryptionInfo);
|
||||
}
|
||||
|
||||
/**
 * Package-private bridge that invokes the superclass
 * {@code writeProperties(POIFSFileSystem, List)} on behalf of callers in
 * this package.
 *
 * @param outFS passed unchanged to the superclass method
 * @param writtenEntries passed unchanged to the superclass method
 * @throws IOException declared for the superclass call
 */
void writePropertiesImpl(POIFSFileSystem outFS, List<String> writtenEntries) throws IOException {
|
||||
super.writeProperties(outFS, writtenEntries);
|
||||
}
|
||||
|
||||
/**
 * Package-private bridge that invokes the superclass
 * {@code validateInPlaceWritePossible()} on behalf of callers in this
 * package.
 *
 * @throws IllegalStateException declared for the superclass call
 */
void validateInPlaceWritePossibleImpl() throws IllegalStateException {
|
||||
super.validateInPlaceWritePossible();
|
||||
}
|
||||
|
||||
/**
 * Package-private bridge that invokes the superclass
 * {@code clearDirectory()} on behalf of callers in this package.
 */
void clearDirectoryImpl() {
|
||||
super.clearDirectory();
|
||||
}
|
||||
|
||||
/**
 * Package-private bridge that invokes the superclass
 * {@code initDirectory()} on behalf of callers in this package.
 *
 * @return the boolean result of the superclass method, unchanged
 */
boolean initDirectoryImpl() {
|
||||
return super.initDirectory();
|
||||
}
|
||||
|
||||
/**
 * Package-private bridge that invokes the superclass
 * {@code replaceDirectory(DirectoryNode)} on behalf of callers in this
 * package.
 *
 * @param newDirectory passed unchanged to the superclass method
 */
void replaceDirectoryImpl(DirectoryNode newDirectory) {
|
||||
super.replaceDirectory(newDirectory);
|
||||
}
|
||||
|
||||
private static class BufAccessBAOS extends ByteArrayOutputStream {
|
||||
public byte[] getBuf() {
|
||||
return buf;
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hsmf.extractor;
|
||||
|
||||
import org.apache.poi.hsmf.MAPIMessage;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.Removal;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
/**
|
||||
* A text extractor for HSMF (Outlook) .msg files.
|
||||
* Outputs in a format somewhat like a plain text email.
|
||||
*
|
||||
* @deprecated use {@link OutlookTextExtractor} instead
|
||||
*/
|
||||
@Deprecated
|
||||
@Removal(version = "5.0.0")
|
||||
public class OutlookTextExtactor extends OutlookTextExtractor {
|
||||
public OutlookTextExtactor(MAPIMessage msg) {
|
||||
super(msg);
|
||||
}
|
||||
|
||||
public OutlookTextExtactor(DirectoryNode poifsDir) throws IOException {
|
||||
super(new MAPIMessage(poifsDir));
|
||||
}
|
||||
|
||||
public OutlookTextExtactor(POIFSFileSystem fs) throws IOException {
|
||||
super(new MAPIMessage(fs));
|
||||
}
|
||||
|
||||
public OutlookTextExtactor(InputStream inp) throws IOException {
|
||||
super(new MAPIMessage(inp));
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
for (String filename : args) {
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(new File(filename));
|
||||
OutlookTextExtractor extractor = new OutlookTextExtractor(poifs)) {
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -42,9 +42,12 @@ import org.apache.poi.util.LocaleUtil;
|
|||
*
|
||||
* @since 4.1.2
|
||||
*/
|
||||
public class OutlookTextExtractor extends POIOLE2TextExtractor {
|
||||
public class OutlookTextExtractor implements POIOLE2TextExtractor {
|
||||
private final MAPIMessage msg;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
public OutlookTextExtractor(MAPIMessage msg) {
|
||||
super(msg);
|
||||
this.msg = msg;
|
||||
}
|
||||
|
||||
public OutlookTextExtractor(DirectoryNode poifsDir) throws IOException {
|
||||
|
@ -76,14 +79,13 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor {
|
|||
* Returns the underlying MAPI message
|
||||
*/
|
||||
public MAPIMessage getMAPIMessage() {
|
||||
return (MAPIMessage) document;
|
||||
return msg;
|
||||
}
|
||||
|
||||
/**
|
||||
* Outputs something a little like a RFC822 email
|
||||
*/
|
||||
public String getText() {
|
||||
MAPIMessage msg = (MAPIMessage) document;
|
||||
StringBuilder s = new StringBuilder();
|
||||
|
||||
// See if we can get a suitable encoding for any
|
||||
|
@ -201,4 +203,24 @@ public class OutlookTextExtractor extends POIOLE2TextExtractor {
|
|||
}
|
||||
s.append("\n");
|
||||
}
|
||||
|
||||
/**
 * Returns the message this extractor was constructed with.
 *
 * @return the {@code msg} field
 */
@Override
|
||||
public MAPIMessage getDocument() {
|
||||
return msg;
|
||||
}
|
||||
|
||||
/**
 * Stores the close-filesystem flag for this extractor.
 * NOTE(review): this method only records the value; where the flag is
 * consulted is not visible in this class excerpt.
 *
 * @param doCloseFilesystem new value of the flag
 */
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * Returns the close-filesystem flag last stored by
 * {@link #setCloseFilesystem(boolean)}.
 *
 * @return the current value of the {@code doCloseFilesystem} field
 */
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * Returns the same {@code msg} object as {@link #getDocument()}; the
 * message itself is handed out as the "filesystem".
 *
 * @return the {@code msg} field
 */
@Override
|
||||
public MAPIMessage getFilesystem() {
|
||||
return msg;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -36,8 +36,9 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public final class Word6Extractor extends POIOLE2TextExtractor {
|
||||
public final class Word6Extractor implements POIOLE2TextExtractor {
|
||||
private HWPFOldDocument doc;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
|
@ -53,8 +54,7 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
|||
* @param fs
|
||||
* POIFSFileSystem containing the word file
|
||||
*/
|
||||
public Word6Extractor( POIFSFileSystem fs ) throws IOException
|
||||
{
|
||||
public Word6Extractor( POIFSFileSystem fs ) throws IOException {
|
||||
this( fs.getRoot() );
|
||||
}
|
||||
|
||||
|
@ -62,14 +62,11 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
|||
* @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
|
||||
*/
|
||||
@Deprecated
|
||||
public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
|
||||
throws IOException
|
||||
{
|
||||
public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs ) throws IOException {
|
||||
this( dir );
|
||||
}
|
||||
|
||||
public Word6Extractor( DirectoryNode dir ) throws IOException
|
||||
{
|
||||
public Word6Extractor( DirectoryNode dir ) throws IOException {
|
||||
this( new HWPFOldDocument( dir ) );
|
||||
}
|
||||
|
||||
|
@ -78,7 +75,6 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
|||
* @param doc The HWPFOldDocument to extract from
|
||||
*/
|
||||
public Word6Extractor(HWPFOldDocument doc) {
|
||||
super(doc);
|
||||
this.doc = doc;
|
||||
}
|
||||
|
||||
|
@ -111,25 +107,40 @@ public final class Word6Extractor extends POIOLE2TextExtractor {
|
|||
return ret;
|
||||
}
|
||||
|
||||
public String getText()
|
||||
{
|
||||
try
|
||||
{
|
||||
public String getText() {
|
||||
try {
|
||||
WordToTextConverter wordToTextConverter = new WordToTextConverter();
|
||||
wordToTextConverter.processDocument( doc );
|
||||
return wordToTextConverter.getText();
|
||||
}
|
||||
catch ( Exception exc )
|
||||
{
|
||||
} catch ( Exception exc ) {
|
||||
// fall-back
|
||||
StringBuilder text = new StringBuilder();
|
||||
|
||||
for ( String t : getParagraphText() )
|
||||
{
|
||||
for ( String t : getParagraphText() ) {
|
||||
text.append( t );
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the old-format Word document held by this extractor.
 *
 * @return the {@code doc} field
 */
@Override
|
||||
public HWPFOldDocument getDocument() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
/**
 * Stores the close-filesystem flag for this extractor.
 * NOTE(review): this method only records the value; where the flag is
 * consulted is not visible in this class excerpt.
 *
 * @param doCloseFilesystem new value of the flag
 */
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * Returns the close-filesystem flag last stored by
 * {@link #setCloseFilesystem(boolean)}.
 *
 * @return the current value of the {@code doCloseFilesystem} field
 */
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * Returns the same {@code doc} object as {@link #getDocument()}; the
 * document itself is handed out as the "filesystem".
 *
 * @return the {@code doc} field
 */
@Override
|
||||
public HWPFOldDocument getFilesystem() {
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
|
@ -39,8 +38,9 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public final class WordExtractor extends POIOLE2TextExtractor {
|
||||
private HWPFDocument doc;
|
||||
public final class WordExtractor implements POIOLE2TextExtractor {
|
||||
private final HWPFDocument doc;
|
||||
private boolean doCloseFilesystem = true;
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
|
@ -73,29 +73,9 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
|||
* The HWPFDocument to extract from
|
||||
*/
|
||||
public WordExtractor( HWPFDocument doc ) {
|
||||
super( doc );
|
||||
this.doc = doc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Command line extractor, so people will stop moaning that they can't just
|
||||
* run this.
|
||||
*/
|
||||
public static void main( String[] args ) throws IOException {
|
||||
if ( args.length == 0 ) {
|
||||
System.err.println( "Use:" );
|
||||
System.err
|
||||
.println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
|
||||
System.exit( 1 );
|
||||
}
|
||||
|
||||
// Process the first argument as a file
|
||||
InputStream fin = new FileInputStream( args[0] );
|
||||
try (WordExtractor extractor = new WordExtractor(fin)) {
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the text from the word file, as an array with one String per
|
||||
* paragraph
|
||||
|
@ -142,7 +122,7 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
|||
return getParagraphText( r );
|
||||
}
|
||||
|
||||
protected static String[] getParagraphText( Range r ) {
|
||||
static String[] getParagraphText( Range r ) {
|
||||
String[] ret;
|
||||
ret = new String[r.numParagraphs()];
|
||||
for ( int i = 0; i < ret.length; i++ ) {
|
||||
|
@ -287,8 +267,27 @@ public final class WordExtractor extends POIOLE2TextExtractor {
|
|||
/**
|
||||
* Removes any fields (eg macros, page markers etc) from the string.
|
||||
*/
|
||||
public static String stripFields( String text )
|
||||
{
|
||||
public static String stripFields( String text ) {
|
||||
return Range.stripFields( text );
|
||||
}
|
||||
|
||||
/**
 * Returns the Word document held by this extractor.
 *
 * @return the {@code doc} field
 */
@Override
|
||||
public HWPFDocument getDocument() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
/**
 * Stores the close-filesystem flag for this extractor.
 * NOTE(review): this method only records the value; where the flag is
 * consulted is not visible in this class excerpt.
 *
 * @param doCloseFilesystem new value of the flag
 */
@Override
|
||||
public void setCloseFilesystem(boolean doCloseFilesystem) {
|
||||
this.doCloseFilesystem = doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * Returns the close-filesystem flag last stored by
 * {@link #setCloseFilesystem(boolean)}.
 *
 * @return the current value of the {@code doCloseFilesystem} field
 */
@Override
|
||||
public boolean isCloseFilesystem() {
|
||||
return doCloseFilesystem;
|
||||
}
|
||||
|
||||
/**
 * Returns the same {@code doc} object as {@link #getDocument()}; the
 * document itself is handed out as the "filesystem".
 *
 * @return the {@code doc} field
 */
@Override
|
||||
public HWPFDocument getFilesystem() {
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,12 +19,9 @@ package org.apache.poi.hdgf.extractor;
|
|||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.PrintStream;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.hdgf.HDGFDiagram;
|
||||
|
@ -32,7 +29,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
import org.junit.Test;
|
||||
|
||||
public final class TestVisioExtractor {
|
||||
private static POIDataSamples _dgTests = POIDataSamples.getDiagramInstance();
|
||||
private static final POIDataSamples _dgTests = POIDataSamples.getDiagramInstance();
|
||||
|
||||
private final String defFilename = "Test_Visio-Some_Random_Text.vsd";
|
||||
private final int defTextChunks = 5;
|
||||
|
@ -108,31 +105,6 @@ public final class TestVisioExtractor {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMain() throws Exception {
|
||||
PrintStream oldOut = System.out;
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
PrintStream capture = new PrintStream(baos);
|
||||
System.setOut(capture);
|
||||
|
||||
String path = _dgTests.getFile(defFilename).getPath();
|
||||
VisioTextExtractor.main(new String[] {path});
|
||||
|
||||
// Put things back
|
||||
System.setOut(oldOut);
|
||||
|
||||
// Check
|
||||
capture.flush();
|
||||
String text = baos.toString();
|
||||
// YK: stdout can contain lots of other stuff if logging is sent to console
|
||||
// ( -Dorg.apache.poi.util.POILogger=org.apache.poi.util.SystemOutLogger)
|
||||
assertTrue( text.contains(
|
||||
"text\nView\n" +
|
||||
"Test View\nI am a test view\n" +
|
||||
"Some random text, on a page\n"
|
||||
));
|
||||
}
|
||||
|
||||
private VisioTextExtractor openExtractor(String fileName) throws IOException {
|
||||
try (InputStream is = _dgTests.openResourceAsStream(fileName)) {
|
||||
return new VisioTextExtractor(is);
|
||||
|
|
|
@ -42,7 +42,6 @@ import org.apache.poi.hsmf.datatypes.PropertyValue;
|
|||
import org.apache.poi.hsmf.datatypes.PropertyValue.LongPropertyValue;
|
||||
import org.apache.poi.hsmf.datatypes.PropertyValue.TimePropertyValue;
|
||||
import org.apache.poi.hsmf.dev.HSMFDump;
|
||||
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
|
||||
import org.apache.poi.hsmf.extractor.OutlookTextExtractor;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.LocaleUtil;
|
||||
|
@ -144,30 +143,20 @@ public final class TestFixedSizedProperties {
|
|||
@Test
|
||||
public void testReadMessageDateSucceedsWithOutlookTextExtractor() throws Exception {
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageSucceeds);
|
||||
ext.setFilesystem(null); // Don't close re-used test resources here
|
||||
ext.setCloseFilesystem(false);
|
||||
|
||||
String text = ext.getText();
|
||||
assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n");
|
||||
ext.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadMessageDateSucceedsWithOutlookTextExtactor() throws Exception {
|
||||
OutlookTextExtactor ext = new OutlookTextExtactor(mapiMessageSucceeds);
|
||||
ext.setFilesystem(null); // Don't close re-used test resources here
|
||||
|
||||
String text = ext.getText();
|
||||
assertContains(text, "Date: Fri, 22 Jun 2012 18:32:54 +0000\n");
|
||||
ext.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Test to see if we can read the Date Chunk with OutlookTextExtractor.
|
||||
*/
|
||||
@Test
|
||||
public void testReadMessageDateFailsWithOutlookTextExtractor() throws Exception {
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(mapiMessageFails);
|
||||
ext.setFilesystem(null); // Don't close re-used test resources here
|
||||
ext.setCloseFilesystem(false);
|
||||
|
||||
String text = ext.getText();
|
||||
assertContains(text, "Date: Thu, 21 Jun 2012 14:14:04 +0000\n");
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.poi.hsmf.extractor;
|
|||
import static org.apache.poi.POITestCase.assertContains;
|
||||
import static org.apache.poi.POITestCase.assertNotContained;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
@ -57,68 +56,62 @@ public final class TestOutlookTextExtractor {
|
|||
|
||||
@Test
|
||||
public void testQuick() throws Exception {
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("quick.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
|
||||
String text = ext.getText();
|
||||
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg);
|
||||
String text = ext.getText();
|
||||
|
||||
assertContains(text, "From: Kevin Roast\n");
|
||||
assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n");
|
||||
assertNotContained(text, "CC:");
|
||||
assertNotContained(text, "BCC:");
|
||||
assertNotContained(text, "Attachment:");
|
||||
assertContains(text, "Subject: Test the content transformer\n");
|
||||
Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55);
|
||||
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
|
||||
f.setTimeZone(LocaleUtil.getUserTimeZone());
|
||||
String dateText = f.format(cal.getTime());
|
||||
assertContains(text, "Date: " + dateText + "\n");
|
||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||
|
||||
ext.close();
|
||||
poifs.close();
|
||||
assertContains(text, "From: Kevin Roast\n");
|
||||
assertContains(text, "To: Kevin Roast <kevin.roast@alfresco.org>\n");
|
||||
assertNotContained(text, "CC:");
|
||||
assertNotContained(text, "BCC:");
|
||||
assertNotContained(text, "Attachment:");
|
||||
assertContains(text, "Subject: Test the content transformer\n");
|
||||
Calendar cal = LocaleUtil.getLocaleCalendar(2007, 5, 14, 9, 42, 55);
|
||||
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss Z", Locale.ROOT);
|
||||
f.setTimeZone(LocaleUtil.getUserTimeZone());
|
||||
String dateText = f.format(cal.getTime());
|
||||
assertContains(text, "Date: " + dateText + "\n");
|
||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimple() throws Exception {
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
|
||||
String text = ext.getText();
|
||||
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg);
|
||||
String text = ext.getText();
|
||||
|
||||
assertContains(text, "From: Travis Ferguson\n");
|
||||
assertContains(text, "To: travis@overwrittenstack.com\n");
|
||||
assertNotContained(text, "CC:");
|
||||
assertNotContained(text, "BCC:");
|
||||
assertContains(text, "Subject: test message\n");
|
||||
assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n");
|
||||
assertContains(text, "This is a test message.");
|
||||
|
||||
ext.close();
|
||||
poifs.close();
|
||||
assertContains(text, "From: Travis Ferguson\n");
|
||||
assertContains(text, "To: travis@overwrittenstack.com\n");
|
||||
assertNotContained(text, "CC:");
|
||||
assertNotContained(text, "BCC:");
|
||||
assertContains(text, "Subject: test message\n");
|
||||
assertContains(text, "Date: Fri, 6 Jul 2007 05:27:17 +0000\n");
|
||||
assertContains(text, "This is a test message.");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConstructors() throws Exception {
|
||||
FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(fis);
|
||||
String inp = ext.getText();
|
||||
ext.close();
|
||||
fis.close();
|
||||
String inp;
|
||||
try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(fis)) {
|
||||
inp = ext.getText();
|
||||
}
|
||||
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
|
||||
ext = new OutlookTextExtractor(poifs);
|
||||
String poifsTxt = ext.getText();
|
||||
ext.close();
|
||||
poifs.close();
|
||||
String poifsTxt;
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("simple_test_msg.msg"), true);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(poifs)){
|
||||
poifsTxt = ext.getText();
|
||||
}
|
||||
|
||||
fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
|
||||
ext = new OutlookTextExtractor(new MAPIMessage(fis));
|
||||
String mapi = ext.getText();
|
||||
ext.close();
|
||||
fis.close();
|
||||
String mapi;
|
||||
try (FileInputStream fis = new FileInputStream(samples.getFile("simple_test_msg.msg"));
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(new MAPIMessage(fis))) {
|
||||
mapi = ext.getText();
|
||||
}
|
||||
|
||||
assertEquals(inp, poifsTxt);
|
||||
assertEquals(inp, mapi);
|
||||
|
@ -142,25 +135,22 @@ public final class TestOutlookTextExtractor {
|
|||
"example_sent_regular.msg", "example_sent_unicode.msg"
|
||||
};
|
||||
for (String file : files) {
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
|
||||
String text = ext.getText();
|
||||
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg);
|
||||
String text = ext.getText();
|
||||
|
||||
assertContains(text, "From: Mike Farman\n");
|
||||
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
|
||||
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
|
||||
assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " +
|
||||
"'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
|
||||
assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " +
|
||||
"'Vonka Jan' <jan.vonka@alfresco.com>\n");
|
||||
assertContains(text, "Subject: This is a test message please ignore\n");
|
||||
assertContains(text, "Date:");
|
||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||
|
||||
ext.close();
|
||||
poifs.close();
|
||||
assertContains(text, "From: Mike Farman\n");
|
||||
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
|
||||
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
|
||||
assertContains(text, "CC: 'nickb@alfresco.com' <nickb@alfresco.com>; " +
|
||||
"'nick.burch@alfresco.com' <nick.burch@alfresco.com>; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
|
||||
assertContains(text, "BCC: 'David Caruana' <dave.caruana@alfresco.com>; " +
|
||||
"'Vonka Jan' <jan.vonka@alfresco.com>\n");
|
||||
assertContains(text, "Subject: This is a test message please ignore\n");
|
||||
assertContains(text, "Date:");
|
||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -182,25 +172,21 @@ public final class TestOutlookTextExtractor {
|
|||
"example_received_regular.msg", "example_received_unicode.msg"
|
||||
};
|
||||
for (String file : files) {
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile(file), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
|
||||
String text = ext.getText();
|
||||
|
||||
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg);
|
||||
String text = ext.getText();
|
||||
|
||||
assertContains(text, "From: Mike Farman\n");
|
||||
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
|
||||
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
|
||||
assertContains(text, "CC: nickb@alfresco.com; " +
|
||||
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
|
||||
assertNotContained(text, "BCC:");
|
||||
assertContains(text, "Subject: This is a test message please ignore\n");
|
||||
assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly
|
||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||
|
||||
ext.close();
|
||||
poifs.close();
|
||||
assertContains(text, "From: Mike Farman\n");
|
||||
assertContains(text, "To: 'Ashutosh Dandavate' <ashutosh.dandavate@alfresco.com>; " +
|
||||
"'Paul Holmes-Higgin' <paul.hh@alfresco.com>; 'Mike Farman' <mikef@alfresco.com>\n");
|
||||
assertContains(text, "CC: nickb@alfresco.com; " +
|
||||
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
|
||||
assertNotContained(text, "BCC:");
|
||||
assertContains(text, "Subject: This is a test message please ignore\n");
|
||||
assertContains(text, "Date: Mon, 11 Jan 2010 16:2"); // Exact times differ slightly
|
||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -210,85 +196,59 @@ public final class TestOutlookTextExtractor {
|
|||
@SuppressWarnings("JavadocReference")
|
||||
@Test
|
||||
public void testWithAttachments() throws Exception {
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg);
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("attachment_test_msg.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
|
||||
|
||||
// Check the normal bits
|
||||
String text = ext.getText();
|
||||
// Check the normal bits
|
||||
String text = ext.getText();
|
||||
|
||||
assertContains(text, "From: Nicolas1");
|
||||
assertContains(text, "To: 'nicolas1.23456@free.fr'");
|
||||
assertNotContained(text, "CC:");
|
||||
assertNotContained(text, "BCC:");
|
||||
assertContains(text, "Subject: test");
|
||||
assertContains(text, "Date: Wed, 22 Apr");
|
||||
assertContains(text, "Attachment: test-unicode.doc\n");
|
||||
assertContains(text, "Attachment: pj1.txt\n");
|
||||
assertContains(text, "contenu");
|
||||
assertContains(text, "From: Nicolas1");
|
||||
assertContains(text, "To: 'nicolas1.23456@free.fr'");
|
||||
assertNotContained(text, "CC:");
|
||||
assertNotContained(text, "BCC:");
|
||||
assertContains(text, "Subject: test");
|
||||
assertContains(text, "Date: Wed, 22 Apr");
|
||||
assertContains(text, "Attachment: test-unicode.doc\n");
|
||||
assertContains(text, "Attachment: pj1.txt\n");
|
||||
assertContains(text, "contenu");
|
||||
|
||||
// Embedded bits are checked in
|
||||
// TestExtractorFactory
|
||||
|
||||
ext.close();
|
||||
poifs.close();
|
||||
// Embedded bits are checked in
|
||||
// TestExtractorFactory
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithAttachedMessage() throws Exception {
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg);
|
||||
String text = ext.getText();
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("58214_with_attachment.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
|
||||
String text = ext.getText();
|
||||
|
||||
// Check we got bits from the main message
|
||||
assertContains(text, "Master mail");
|
||||
assertContains(text, "ante in lacinia euismod");
|
||||
// Check we got bits from the main message
|
||||
assertContains(text, "Master mail");
|
||||
assertContains(text, "ante in lacinia euismod");
|
||||
|
||||
// But not the attached message
|
||||
assertNotContained(text, "Test mail attachment");
|
||||
assertNotContained(text, "Lorem ipsum dolor sit");
|
||||
|
||||
ext.close();
|
||||
poifs.close();
|
||||
// But not the attached message
|
||||
assertNotContained(text, "Test mail attachment");
|
||||
assertNotContained(text, "Lorem ipsum dolor sit");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEncodings() throws Exception {
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg);
|
||||
String text = ext.getText();
|
||||
try (POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtractor ext = new OutlookTextExtractor(msg)) {
|
||||
String text = ext.getText();
|
||||
|
||||
// Check the english bits
|
||||
assertContains(text, "From: Tests Chang@FT");
|
||||
assertContains(text, "tests.chang@fengttt.com");
|
||||
// Check the english bits
|
||||
assertContains(text, "From: Tests Chang@FT");
|
||||
assertContains(text, "tests.chang@fengttt.com");
|
||||
|
||||
// And check some chinese bits
|
||||
assertContains(text, "(\u5f35\u6bd3\u502b)");
|
||||
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
|
||||
|
||||
ext.close();
|
||||
poifs.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEncodingsDeprecatedClass() throws Exception {
|
||||
POIFSFileSystem poifs = new POIFSFileSystem(samples.getFile("chinese-traditional.msg"), true);
|
||||
MAPIMessage msg = new MAPIMessage(poifs);
|
||||
OutlookTextExtactor ext = new OutlookTextExtactor(msg);
|
||||
assertTrue("OutlookTextExtactor instanceof OutlookTextExtractor", ext instanceof OutlookTextExtractor);
|
||||
String text = ext.getText();
|
||||
|
||||
// Check the english bits
|
||||
assertContains(text, "From: Tests Chang@FT");
|
||||
assertContains(text, "tests.chang@fengttt.com");
|
||||
|
||||
// And check some chinese bits
|
||||
assertContains(text, "(\u5f35\u6bd3\u502b)");
|
||||
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
|
||||
|
||||
ext.close();
|
||||
poifs.close();
|
||||
// And check some chinese bits
|
||||
assertContains(text, "(\u5f35\u6bd3\u502b)");
|
||||
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,16 +17,16 @@
|
|||
|
||||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.extractor.OLE2ExtractorFactory;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.extractor.ExtractorFactory;
|
||||
import org.apache.poi.extractor.POITextExtractor;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Tests for bugs with the WordExtractor
|
||||
|
@ -61,7 +61,7 @@ public final class TestWordExtractorBugs {
|
|||
@Test
|
||||
public void testBug60374() throws Exception {
|
||||
POIFSFileSystem fs = new POIFSFileSystem(SAMPLES.openResourceAsStream("cn.orthodox.www_divenbog_APRIL_30-APRIL.DOC"));
|
||||
final POITextExtractor extractor = OLE2ExtractorFactory.createExtractor(fs);
|
||||
final POITextExtractor extractor = ExtractorFactory.createExtractor(fs);
|
||||
|
||||
// Check it gives text without error
|
||||
assertNotNull(extractor.getText());
|
||||
|
|
|
@ -25,7 +25,7 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.hpsf.*;
|
||||
import org.apache.poi.hpsf.Thumbnail;
|
||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
import org.apache.poi.hssf.extractor.ExcelExtractor;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
|
@ -101,42 +101,31 @@ public final class TestHPSFPropertiesExtractor {
|
|||
|
||||
@Test
|
||||
public void testConstructors() throws IOException {
|
||||
POIFSFileSystem fs;
|
||||
HSSFWorkbook wb;
|
||||
try {
|
||||
fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls"));
|
||||
wb = new HSSFWorkbook(fs);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
ExcelExtractor excelExt = new ExcelExtractor(wb);
|
||||
|
||||
final String fsText;
|
||||
HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs);
|
||||
fsExt.setFilesystem(null); // Don't close re-used test resources!
|
||||
try {
|
||||
fsText = fsExt.getText();
|
||||
} finally {
|
||||
fsExt.close();
|
||||
}
|
||||
|
||||
final String hwText;
|
||||
HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb);
|
||||
hwExt.setFilesystem(null); // Don't close re-used test resources!
|
||||
try {
|
||||
hwText = hwExt.getText();
|
||||
} finally {
|
||||
hwExt.close();
|
||||
}
|
||||
|
||||
final String eeText;
|
||||
HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt);
|
||||
eeExt.setFilesystem(null); // Don't close re-used test resources!
|
||||
try {
|
||||
eeText = eeExt.getText();
|
||||
} finally {
|
||||
eeExt.close();
|
||||
wb.close();
|
||||
|
||||
try (POIFSFileSystem fs = new POIFSFileSystem(_samples.openResourceAsStream("TestUnicode.xls"));
|
||||
HSSFWorkbook wb = new HSSFWorkbook(fs);
|
||||
ExcelExtractor excelExt = new ExcelExtractor(wb)) {
|
||||
|
||||
try (HPSFPropertiesExtractor fsExt = new HPSFPropertiesExtractor(fs)) {
|
||||
// Don't close re-used test resources!
|
||||
fsExt.setCloseFilesystem(false);
|
||||
fsText = fsExt.getText();
|
||||
}
|
||||
|
||||
try (HPSFPropertiesExtractor hwExt = new HPSFPropertiesExtractor(wb)) {
|
||||
// Don't close re-used test resources!
|
||||
hwExt.setCloseFilesystem(false);
|
||||
hwText = hwExt.getText();
|
||||
}
|
||||
|
||||
try (HPSFPropertiesExtractor eeExt = new HPSFPropertiesExtractor(excelExt)) {
|
||||
// Don't close re-used test resources!
|
||||
eeExt.setCloseFilesystem(false);
|
||||
eeText = eeExt.getText();
|
||||
}
|
||||
}
|
||||
|
||||
assertEquals(fsText, hwText);
|
||||
|
|
|
@ -43,9 +43,7 @@ public final class TestExcelExtractor {
|
|||
private static ExcelExtractor createExtractor(String sampleFileName) throws IOException {
|
||||
File file = HSSFTestDataSamples.getSampleFile(sampleFileName);
|
||||
POIFSFileSystem fs = new POIFSFileSystem(file);
|
||||
ExcelExtractor extractor = new ExcelExtractor(fs);
|
||||
extractor.setFilesystem(fs);
|
||||
return extractor;
|
||||
return new ExcelExtractor(fs);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Reference in New Issue