mirror of https://github.com/apache/poi.git
Start moving logic over into the main and scratchpad jars for OLE2
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752226 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a5f19ab07f
commit
ef2af2d53d
|
@ -20,8 +20,10 @@ import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAME
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.lang.reflect.Method;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.POITextExtractor;
|
import org.apache.poi.POITextExtractor;
|
||||||
|
@ -33,6 +35,8 @@ import org.apache.poi.poifs.filesystem.Entry;
|
||||||
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||||
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.util.POILogFactory;
|
||||||
|
import org.apache.poi.util.POILogger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Figures out the correct POIOLE2TextExtractor for your supplied
|
* Figures out the correct POIOLE2TextExtractor for your supplied
|
||||||
|
@ -48,6 +52,8 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("WeakerAccess")
|
@SuppressWarnings("WeakerAccess")
|
||||||
public class OLE2ExtractorFactory {
|
public class OLE2ExtractorFactory {
|
||||||
|
private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
|
||||||
|
|
||||||
/** Should this thread prefer event based over usermodel based extractors? */
|
/** Should this thread prefer event based over usermodel based extractors? */
|
||||||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
||||||
@Override
|
@Override
|
||||||
|
@ -115,9 +121,36 @@ public class OLE2ExtractorFactory {
|
||||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POITextExtractor createExtractor(InputStream input) {
|
public static POITextExtractor createExtractor(InputStream input) throws IOException {
|
||||||
// TODO Something nasty with reflection...
|
Class<?> cls = getOOXMLClass();
|
||||||
return null;
|
if (cls != null) {
|
||||||
|
// TODO Reflection
|
||||||
|
throw new IllegalArgumentException("TODO Reflection");
|
||||||
|
} else {
|
||||||
|
// Best hope it's OLE2....
|
||||||
|
return createExtractor(new NPOIFSFileSystem(input));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Class<?> getOOXMLClass() {
|
||||||
|
try {
|
||||||
|
return OLE2ExtractorFactory.class.getClassLoader().loadClass(
|
||||||
|
"org.apache.poi.extractor.ExtractorFactory"
|
||||||
|
);
|
||||||
|
} catch (ClassNotFoundException e) {
|
||||||
|
LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private static Class<?> getScratchpadClass() {
|
||||||
|
try {
|
||||||
|
return OLE2ExtractorFactory.class.getClassLoader().loadClass(
|
||||||
|
"org.apache.poi.extractor.OLE2ScrachpadExtractorFactory"
|
||||||
|
);
|
||||||
|
} catch (ClassNotFoundException e) {
|
||||||
|
LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
|
||||||
|
throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -139,7 +172,15 @@ public class OLE2ExtractorFactory {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO Try to ask the Scratchpad
|
// Ask Scratchpad, or fail trying
|
||||||
|
Class<?> cls = getScratchpadClass();
|
||||||
|
try {
|
||||||
|
Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
|
||||||
|
POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
|
||||||
|
if (ext != null) return ext;
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
|
||||||
|
}
|
||||||
|
|
||||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
||||||
}
|
}
|
||||||
|
@ -155,9 +196,9 @@ public class OLE2ExtractorFactory {
|
||||||
throws IOException
|
throws IOException
|
||||||
{
|
{
|
||||||
// All the embedded directories we spotted
|
// All the embedded directories we spotted
|
||||||
ArrayList<Entry> dirs = new ArrayList<Entry>();
|
List<Entry> dirs = new ArrayList<Entry>();
|
||||||
// For anything else not directly held in as a POIFS directory
|
// For anything else not directly held in as a POIFS directory
|
||||||
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
|
List<InputStream> nonPOIFS = new ArrayList<InputStream>();
|
||||||
|
|
||||||
// Find all the embedded directories
|
// Find all the embedded directories
|
||||||
DirectoryEntry root = ext.getRoot();
|
DirectoryEntry root = ext.getRoot();
|
||||||
|
@ -175,7 +216,15 @@ public class OLE2ExtractorFactory {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// TODO Ask scratchpad
|
// Ask Scratchpad, or fail trying
|
||||||
|
Class<?> cls = getScratchpadClass();
|
||||||
|
try {
|
||||||
|
Method m = cls.getDeclaredMethod(
|
||||||
|
"identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
|
||||||
|
m.invoke(null, ext, dirs, nonPOIFS);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create the extractors
|
// Create the extractors
|
||||||
|
@ -195,10 +244,10 @@ public class OLE2ExtractorFactory {
|
||||||
} catch (IllegalArgumentException ie) {
|
} catch (IllegalArgumentException ie) {
|
||||||
// Ignore, just means it didn't contain
|
// Ignore, just means it didn't contain
|
||||||
// a format we support as yet
|
// a format we support as yet
|
||||||
// TODO Should we log this?
|
LOGGER.log(POILogger.WARN, ie);
|
||||||
} catch (Exception xe) {
|
} catch (Exception xe) {
|
||||||
// Ignore, invalid format
|
// Ignore, invalid format
|
||||||
// TODO Should we log this?
|
LOGGER.log(POILogger.WARN, xe);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return e.toArray(new POITextExtractor[e.size()]);
|
return e.toArray(new POITextExtractor[e.size()]);
|
||||||
|
|
|
@ -78,23 +78,13 @@ public class ExtractorFactory {
|
||||||
protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
|
protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
|
||||||
protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
|
protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
|
||||||
|
|
||||||
|
|
||||||
/** Should this thread prefer event based over usermodel based extractors? */
|
|
||||||
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
|
|
||||||
@Override
|
|
||||||
protected Boolean initialValue() { return Boolean.FALSE; }
|
|
||||||
};
|
|
||||||
|
|
||||||
/** Should all threads prefer event based over usermodel based extractors? */
|
|
||||||
private static Boolean allPreferEventExtractors;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Should this thread prefer event based over usermodel based extractors?
|
* Should this thread prefer event based over usermodel based extractors?
|
||||||
* (usermodel extractors tend to be more accurate, but use more memory)
|
* (usermodel extractors tend to be more accurate, but use more memory)
|
||||||
* Default is false.
|
* Default is false.
|
||||||
*/
|
*/
|
||||||
public static boolean getThreadPrefersEventExtractors() {
|
public static boolean getThreadPrefersEventExtractors() {
|
||||||
return threadPreferEventExtractors.get();
|
return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -103,7 +93,7 @@ public class ExtractorFactory {
|
||||||
* Default is to use the thread level setting, which defaults to false.
|
* Default is to use the thread level setting, which defaults to false.
|
||||||
*/
|
*/
|
||||||
public static Boolean getAllThreadsPreferEventExtractors() {
|
public static Boolean getAllThreadsPreferEventExtractors() {
|
||||||
return allPreferEventExtractors;
|
return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -111,7 +101,7 @@ public class ExtractorFactory {
|
||||||
* Will only be used if the All Threads setting is null.
|
* Will only be used if the All Threads setting is null.
|
||||||
*/
|
*/
|
||||||
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
|
||||||
threadPreferEventExtractors.set(preferEventExtractors);
|
OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -119,7 +109,7 @@ public class ExtractorFactory {
|
||||||
* If set, will take preference over the Thread level setting.
|
* If set, will take preference over the Thread level setting.
|
||||||
*/
|
*/
|
||||||
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
|
||||||
allPreferEventExtractors = preferEventExtractors;
|
OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -127,10 +117,7 @@ public class ExtractorFactory {
|
||||||
* Checks the all-threads one first, then thread specific.
|
* Checks the all-threads one first, then thread specific.
|
||||||
*/
|
*/
|
||||||
protected static boolean getPreferEventExtractor() {
|
protected static boolean getPreferEventExtractor() {
|
||||||
if(allPreferEventExtractors != null) {
|
return OLE2ExtractorFactory.getPreferEventExtractor();
|
||||||
return allPreferEventExtractors;
|
|
||||||
}
|
|
||||||
return threadPreferEventExtractors.get();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
|
||||||
|
@ -281,83 +268,28 @@ public class ExtractorFactory {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
// Only ever an OLE2 one from the root of the FS
|
return OLE2ExtractorFactory.createExtractor(fs);
|
||||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
|
||||||
}
|
}
|
||||||
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
// Only ever an OLE2 one from the root of the FS
|
return OLE2ExtractorFactory.createExtractor(fs);
|
||||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
|
||||||
}
|
}
|
||||||
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
|
||||||
// Only ever an OLE2 one from the root of the FS
|
return OLE2ExtractorFactory.createExtractor(fs);
|
||||||
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
|
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
|
||||||
OpenXML4JException, XmlException
|
OpenXML4JException, XmlException
|
||||||
{
|
{
|
||||||
// Look for certain entries in the stream, to figure it
|
// First, check for OOXML
|
||||||
// out from
|
|
||||||
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
|
|
||||||
if (poifsDir.hasEntry(workbookName)) {
|
|
||||||
if (getPreferEventExtractor()) {
|
|
||||||
return new EventBasedExcelExtractor(poifsDir);
|
|
||||||
}
|
|
||||||
return new ExcelExtractor(poifsDir);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
|
|
||||||
throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
|
|
||||||
+ "found. Please call OldExcelExtractor directly for basic text extraction");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (poifsDir.hasEntry("WordDocument")) {
|
|
||||||
// Old or new style word document?
|
|
||||||
try {
|
|
||||||
return new WordExtractor(poifsDir);
|
|
||||||
} catch (OldWordFileFormatException e) {
|
|
||||||
return new Word6Extractor(poifsDir);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (poifsDir.hasEntry("PowerPoint Document")) {
|
|
||||||
return new PowerPointExtractor(poifsDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (poifsDir.hasEntry("VisioDocument")) {
|
|
||||||
return new VisioTextExtractor(poifsDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (poifsDir.hasEntry("Quill")) {
|
|
||||||
return new PublisherTextExtractor(poifsDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
final String[] outlookEntryNames = new String[] {
|
|
||||||
// message bodies, saved as plain text (PtypString)
|
|
||||||
// The first short (0x1000, 0x0047, 0x0037) refers to the Property ID (see [MS-OXPROPS].pdf)
|
|
||||||
// the second short (0x001e, 0x001f, 0x0102) refers to the type of data stored in this entry
|
|
||||||
// https://msdn.microsoft.com/en-us/library/cc433490(v=exchg.80).aspx
|
|
||||||
// @see org.apache.poi.hsmf.Types.MAPIType
|
|
||||||
"__substg1.0_1000001E", //PidTagBody ASCII
|
|
||||||
"__substg1.0_1000001F", //PidTagBody Unicode
|
|
||||||
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
|
|
||||||
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
|
|
||||||
"__substg1.0_0037001E", //PidTagSubject ASCII
|
|
||||||
"__substg1.0_0037001F", //PidTagSubject Unicode
|
|
||||||
};
|
|
||||||
for (String entryName : outlookEntryNames) {
|
|
||||||
if (poifsDir.hasEntry(entryName)) {
|
|
||||||
return new OutlookTextExtactor(poifsDir);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (String entryName : poifsDir.getEntryNames()) {
|
for (String entryName : poifsDir.getEntryNames()) {
|
||||||
if (entryName.equals("Package")) {
|
if (entryName.equals("Package")) {
|
||||||
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
|
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
|
||||||
return createExtractor(pkg);
|
return createExtractor(pkg);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
|
|
||||||
|
// If not, ask the OLE2 code to check, with Scratchpad if possible
|
||||||
|
return OLE2ExtractorFactory.createExtractor(poifsDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -150,6 +150,7 @@ public class TestExtractorFactory {
|
||||||
|
|
||||||
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
|
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
|
extractor.getClass().getName(),
|
||||||
extractor
|
extractor
|
||||||
instanceof XSSFExcelExtractor
|
instanceof XSSFExcelExtractor
|
||||||
);
|
);
|
||||||
|
@ -163,6 +164,7 @@ public class TestExtractorFactory {
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(xltx);
|
extractor = ExtractorFactory.createExtractor(xltx);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
|
extractor.getClass().getName(),
|
||||||
extractor
|
extractor
|
||||||
instanceof XSSFExcelExtractor
|
instanceof XSSFExcelExtractor
|
||||||
);
|
);
|
||||||
|
@ -340,6 +342,7 @@ public class TestExtractorFactory {
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
|
||||||
assertTrue(
|
assertTrue(
|
||||||
|
extractor.getClass().getName(),
|
||||||
extractor
|
extractor
|
||||||
instanceof XSSFExcelExtractor
|
instanceof XSSFExcelExtractor
|
||||||
);
|
);
|
||||||
|
@ -359,6 +362,7 @@ public class TestExtractorFactory {
|
||||||
// Word
|
// Word
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
|
||||||
assertTrue(
|
assertTrue(
|
||||||
|
extractor.getClass().getName(),
|
||||||
extractor
|
extractor
|
||||||
instanceof WordExtractor
|
instanceof WordExtractor
|
||||||
);
|
);
|
||||||
|
@ -369,6 +373,7 @@ public class TestExtractorFactory {
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
|
||||||
assertTrue(
|
assertTrue(
|
||||||
|
extractor.getClass().getName(),
|
||||||
extractor
|
extractor
|
||||||
instanceof Word6Extractor
|
instanceof Word6Extractor
|
||||||
);
|
);
|
||||||
|
@ -379,6 +384,7 @@ public class TestExtractorFactory {
|
||||||
|
|
||||||
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
|
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
|
||||||
assertTrue(
|
assertTrue(
|
||||||
|
extractor.getClass().getName(),
|
||||||
extractor
|
extractor
|
||||||
instanceof Word6Extractor
|
instanceof Word6Extractor
|
||||||
);
|
);
|
||||||
|
|
|
@ -20,8 +20,8 @@ import java.io.ByteArrayInputStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.POITextExtractor;
|
import org.apache.poi.POITextExtractor;
|
||||||
|
@ -108,7 +108,7 @@ public class OLE2ScrachpadExtractorFactory {
|
||||||
* empty array. Otherwise, you'll get one open
|
* empty array. Otherwise, you'll get one open
|
||||||
* {@link POITextExtractor} for each embedded file.
|
* {@link POITextExtractor} for each embedded file.
|
||||||
*/
|
*/
|
||||||
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, ArrayList<Entry> dirs, ArrayList<InputStream> nonPOIFS) throws IOException {
|
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
|
||||||
// Find all the embedded directories
|
// Find all the embedded directories
|
||||||
DirectoryEntry root = ext.getRoot();
|
DirectoryEntry root = ext.getRoot();
|
||||||
if(root == null) {
|
if(root == null) {
|
||||||
|
|
Loading…
Reference in New Issue