Start moving logic over into the main and scratchpad jars for OLE2

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1752226 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2016-07-11 22:47:02 +00:00
parent a5f19ab07f
commit ef2af2d53d
4 changed files with 79 additions and 92 deletions

View File

@ -20,8 +20,10 @@ import static org.apache.poi.hssf.model.InternalWorkbook.WORKBOOK_DIR_ENTRY_NAME
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.lang.reflect.Method;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List;
import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor; import org.apache.poi.POITextExtractor;
@ -33,6 +35,8 @@ import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/** /**
* Figures out the correct POIOLE2TextExtractor for your supplied * Figures out the correct POIOLE2TextExtractor for your supplied
@ -48,6 +52,8 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
*/ */
@SuppressWarnings("WeakerAccess") @SuppressWarnings("WeakerAccess")
public class OLE2ExtractorFactory { public class OLE2ExtractorFactory {
private static final POILogger LOGGER = POILogFactory.getLogger(OLE2ExtractorFactory.class);
/** Should this thread prefer event based over usermodel based extractors? */ /** Should this thread prefer event based over usermodel based extractors? */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() { private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
@Override @Override
@ -115,10 +121,37 @@ public class OLE2ExtractorFactory {
return (POIOLE2TextExtractor)createExtractor(fs.getRoot()); return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
} }
public static POITextExtractor createExtractor(InputStream input) { public static POITextExtractor createExtractor(InputStream input) throws IOException {
// TODO Something nasty with reflection... Class<?> cls = getOOXMLClass();
if (cls != null) {
// TODO Reflection
throw new IllegalArgumentException("TODO Reflection");
} else {
// Best hope it's OLE2....
return createExtractor(new NPOIFSFileSystem(input));
}
}
private static Class<?> getOOXMLClass() {
try {
return OLE2ExtractorFactory.class.getClassLoader().loadClass(
"org.apache.poi.extractor.ExtractorFactory"
);
} catch (ClassNotFoundException e) {
LOGGER.log(POILogger.WARN, "POI OOXML jar missing");
return null; return null;
} }
}
private static Class<?> getScratchpadClass() {
try {
return OLE2ExtractorFactory.class.getClassLoader().loadClass(
"org.apache.poi.extractor.OLE2ScrachpadExtractorFactory"
);
} catch (ClassNotFoundException e) {
LOGGER.log(POILogger.ERROR, "POI Scratchpad jar missing");
throw new IllegalStateException("POI Scratchpad jar missing, required for ExtractorFactory");
}
}
/** /**
* Create the Extractor, if possible. Generally needs the Scratchpad jar. * Create the Extractor, if possible. Generally needs the Scratchpad jar.
@ -139,7 +172,15 @@ public class OLE2ExtractorFactory {
} }
} }
// TODO Try to ask the Scratchpad // Ask Scratchpad, or fail trying
Class<?> cls = getScratchpadClass();
try {
Method m = cls.getDeclaredMethod("createExtractor", DirectoryNode.class);
POITextExtractor ext = (POITextExtractor)m.invoke(null, poifsDir);
if (ext != null) return ext;
} catch (Exception e) {
throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
}
throw new IllegalArgumentException("No supported documents found in the OLE2 stream"); throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
} }
@ -155,9 +196,9 @@ public class OLE2ExtractorFactory {
throws IOException throws IOException
{ {
// All the embedded directories we spotted // All the embedded directories we spotted
ArrayList<Entry> dirs = new ArrayList<Entry>(); List<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory // For anything else not directly held in as a POIFS directory
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>(); List<InputStream> nonPOIFS = new ArrayList<InputStream>();
// Find all the embedded directories // Find all the embedded directories
DirectoryEntry root = ext.getRoot(); DirectoryEntry root = ext.getRoot();
@ -175,7 +216,15 @@ public class OLE2ExtractorFactory {
} }
} }
} else { } else {
// TODO Ask scratchpad // Ask Scratchpad, or fail trying
Class<?> cls = getScratchpadClass();
try {
Method m = cls.getDeclaredMethod(
"identifyEmbeddedResources", POIOLE2TextExtractor.class, List.class, List.class);
m.invoke(null, ext, dirs, nonPOIFS);
} catch (Exception e) {
throw new IllegalArgumentException("Error checking for Scratchpad embedded resources", e);
}
} }
// Create the extractors // Create the extractors
@ -195,10 +244,10 @@ public class OLE2ExtractorFactory {
} catch (IllegalArgumentException ie) { } catch (IllegalArgumentException ie) {
// Ignore, just means it didn't contain // Ignore, just means it didn't contain
// a format we support as yet // a format we support as yet
// TODO Should we log this? LOGGER.log(POILogger.WARN, ie);
} catch (Exception xe) { } catch (Exception xe) {
// Ignore, invalid format // Ignore, invalid format
// TODO Should we log this? LOGGER.log(POILogger.WARN, xe);
} }
} }
return e.toArray(new POITextExtractor[e.size()]); return e.toArray(new POITextExtractor[e.size()]);

View File

@ -78,23 +78,13 @@ public class ExtractorFactory {
protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT;
protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT;
/** Should this thread prefer event based over usermodel based extractors? */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
@Override
protected Boolean initialValue() { return Boolean.FALSE; }
};
/** Should all threads prefer event based over usermodel based extractors? */
private static Boolean allPreferEventExtractors;
/** /**
* Should this thread prefer event based over usermodel based extractors? * Should this thread prefer event based over usermodel based extractors?
* (usermodel extractors tend to be more accurate, but use more memory) * (usermodel extractors tend to be more accurate, but use more memory)
* Default is false. * Default is false.
*/ */
public static boolean getThreadPrefersEventExtractors() { public static boolean getThreadPrefersEventExtractors() {
return threadPreferEventExtractors.get(); return OLE2ExtractorFactory.getThreadPrefersEventExtractors();
} }
/** /**
@ -103,7 +93,7 @@ public class ExtractorFactory {
* Default is to use the thread level setting, which defaults to false. * Default is to use the thread level setting, which defaults to false.
*/ */
public static Boolean getAllThreadsPreferEventExtractors() { public static Boolean getAllThreadsPreferEventExtractors() {
return allPreferEventExtractors; return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors();
} }
/** /**
@ -111,7 +101,7 @@ public class ExtractorFactory {
* Will only be used if the All Threads setting is null. * Will only be used if the All Threads setting is null.
*/ */
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
threadPreferEventExtractors.set(preferEventExtractors); OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors);
} }
/** /**
@ -119,7 +109,7 @@ public class ExtractorFactory {
* If set, will take preference over the Thread level setting. * If set, will take preference over the Thread level setting.
*/ */
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
allPreferEventExtractors = preferEventExtractors; OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors);
} }
/** /**
@ -127,10 +117,7 @@ public class ExtractorFactory {
* Checks the all-threads one first, then thread specific. * Checks the all-threads one first, then thread specific.
*/ */
protected static boolean getPreferEventExtractor() { protected static boolean getPreferEventExtractor() {
if(allPreferEventExtractors != null) { return OLE2ExtractorFactory.getPreferEventExtractor();
return allPreferEventExtractors;
}
return threadPreferEventExtractors.get();
} }
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException { public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
@ -281,83 +268,28 @@ public class ExtractorFactory {
} }
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS return OLE2ExtractorFactory.createExtractor(fs);
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
} }
public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS return OLE2ExtractorFactory.createExtractor(fs);
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
} }
public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS return OLE2ExtractorFactory.createExtractor(fs);
return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
} }
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
OpenXML4JException, XmlException OpenXML4JException, XmlException
{ {
// Look for certain entries in the stream, to figure it // First, check for OOXML
// out from
for (String workbookName : WORKBOOK_DIR_ENTRY_NAMES) {
if (poifsDir.hasEntry(workbookName)) {
if (getPreferEventExtractor()) {
return new EventBasedExcelExtractor(poifsDir);
}
return new ExcelExtractor(poifsDir);
}
}
if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
+ "found. Please call OldExcelExtractor directly for basic text extraction");
}
if (poifsDir.hasEntry("WordDocument")) {
// Old or new style word document?
try {
return new WordExtractor(poifsDir);
} catch (OldWordFileFormatException e) {
return new Word6Extractor(poifsDir);
}
}
if (poifsDir.hasEntry("PowerPoint Document")) {
return new PowerPointExtractor(poifsDir);
}
if (poifsDir.hasEntry("VisioDocument")) {
return new VisioTextExtractor(poifsDir);
}
if (poifsDir.hasEntry("Quill")) {
return new PublisherTextExtractor(poifsDir);
}
final String[] outlookEntryNames = new String[] {
// message bodies, saved as plain text (PtypString)
// The first short (0x1000, 0x0047, 0x0037) refer to the Property ID (see [MS-OXPROPS].pdf)
// the second short (0x001e, 0x001f, 0x0102) refer to the type of data stored in this entry
// https://msdn.microsoft.com/endatatypes.Ex-us/library/cc433490(v=exchg.80).aspx
// @see org.apache.poi.hsmf.Types.MAPIType
"__substg1.0_1000001E", //PidTagBody ASCII
"__substg1.0_1000001F", //PidTagBody Unicode
"__substg1.0_0047001E", //PidTagMessageSubmissionId ASCII
"__substg1.0_0047001F", //PidTagMessageSubmissionId Unicode
"__substg1.0_0037001E", //PidTagSubject ASCII
"__substg1.0_0037001F", //PidTagSubject Unicode
};
for (String entryName : outlookEntryNames) {
if (poifsDir.hasEntry(entryName)) {
return new OutlookTextExtactor(poifsDir);
}
}
for (String entryName : poifsDir.getEntryNames()) { for (String entryName : poifsDir.getEntryNames()) {
if (entryName.equals("Package")) { if (entryName.equals("Package")) {
OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package")); OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
return createExtractor(pkg); return createExtractor(pkg);
} }
} }
throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
// If not, ask the OLE2 code to check, with Scratchpad if possible
return OLE2ExtractorFactory.createExtractor(poifsDir);
} }
/** /**

View File

@ -150,6 +150,7 @@ public class TestExtractorFactory {
POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx); POITextExtractor extractor = ExtractorFactory.createExtractor(xlsx);
assertTrue( assertTrue(
extractor.getClass().getName(),
extractor extractor
instanceof XSSFExcelExtractor instanceof XSSFExcelExtractor
); );
@ -163,6 +164,7 @@ public class TestExtractorFactory {
extractor = ExtractorFactory.createExtractor(xltx); extractor = ExtractorFactory.createExtractor(xltx);
assertTrue( assertTrue(
extractor.getClass().getName(),
extractor extractor
instanceof XSSFExcelExtractor instanceof XSSFExcelExtractor
); );
@ -340,6 +342,7 @@ public class TestExtractorFactory {
extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx)); extractor = ExtractorFactory.createExtractor(new FileInputStream(xlsx));
assertTrue( assertTrue(
extractor.getClass().getName(),
extractor extractor
instanceof XSSFExcelExtractor instanceof XSSFExcelExtractor
); );
@ -359,6 +362,7 @@ public class TestExtractorFactory {
// Word // Word
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc)); extractor = ExtractorFactory.createExtractor(new FileInputStream(doc));
assertTrue( assertTrue(
extractor.getClass().getName(),
extractor extractor
instanceof WordExtractor instanceof WordExtractor
); );
@ -369,6 +373,7 @@ public class TestExtractorFactory {
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6)); extractor = ExtractorFactory.createExtractor(new FileInputStream(doc6));
assertTrue( assertTrue(
extractor.getClass().getName(),
extractor extractor
instanceof Word6Extractor instanceof Word6Extractor
); );
@ -379,6 +384,7 @@ public class TestExtractorFactory {
extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95)); extractor = ExtractorFactory.createExtractor(new FileInputStream(doc95));
assertTrue( assertTrue(
extractor.getClass().getName(),
extractor extractor
instanceof Word6Extractor instanceof Word6Extractor
); );

View File

@ -20,8 +20,8 @@ import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List;
import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.POITextExtractor; import org.apache.poi.POITextExtractor;
@ -108,7 +108,7 @@ public class OLE2ScrachpadExtractorFactory {
* empty array. Otherwise, you'll get one open * empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embedded file. * {@link POITextExtractor} for each embedded file.
*/ */
public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, ArrayList<Entry> dirs, ArrayList<InputStream> nonPOIFS) throws IOException { public static void identifyEmbeddedResources(POIOLE2TextExtractor ext, List<Entry> dirs, List<InputStream> nonPOIFS) throws IOException {
// Find all the embedded directories // Find all the embedded directories
DirectoryEntry root = ext.getRoot(); DirectoryEntry root = ext.getRoot();
if(root == null) { if(root == null) {