60279 -- back off to brute-force search for macro content if offset information is not correct

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1808301 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-09-14 02:22:55 +00:00
parent 23ecb9a172
commit 76d3f15560
4 changed files with 386 additions and 110 deletions

View File

@ -43,7 +43,9 @@ import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.HexDump;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.RLEDecompressingInputStream;
import org.apache.poi.util.StringUtil;
/**
* <p>Finds all VBA Macros in an office file (OLE2/POIFS and OOXML/OPC),
@ -61,9 +63,7 @@ import org.apache.poi.util.RLEDecompressingInputStream;
public class VBAMacroReader implements Closeable {
protected static final String VBA_PROJECT_OOXML = "vbaProject.bin";
protected static final String VBA_PROJECT_POIFS = "VBA";
// FIXME: When minimum supported version is Java 7, replace with java.nio.charset.StandardCharsets.UTF_16LE
private static final Charset UTF_16LE = Charset.forName("UTF-16LE");
private NPOIFSFileSystem fs;
public VBAMacroReader(InputStream rstream) throws IOException {
@ -145,7 +145,7 @@ public class VBAMacroReader implements Closeable {
}
}
protected static class ModuleMap extends HashMap<String, Module> {
Charset charset = Charset.forName("Cp1252"); // default charset
Charset charset = StringUtil.WIN_1252; // default charset
}
/**
@ -172,20 +172,7 @@ public class VBAMacroReader implements Closeable {
}
}
/**
* Read <tt>length</tt> bytes of MBCS (multi-byte character set) characters from the stream
*
* @param stream the inputstream to read from
* @param length number of bytes to read from stream
* @param charset the character set encoding of the bytes in the stream
* @return a java String in the supplied character set
* @throws IOException If reading from the stream fails
*/
private static String readString(InputStream stream, int length, Charset charset) throws IOException {
byte[] buffer = new byte[length];
int count = stream.read(buffer);
return new String(buffer, 0, count, charset);
}
/**
* reads module from DIR node in input stream and adds it to the modules map for decompression later
@ -199,7 +186,7 @@ public class VBAMacroReader implements Closeable {
* @param modules a map to store the modules
* @throws IOException If reading data from the stream or from modules fails
*/
private static void readModule(RLEDecompressingInputStream in, String streamName, ModuleMap modules) throws IOException {
private static void readModuleMetadataFromDirStream(RLEDecompressingInputStream in, String streamName, ModuleMap modules) throws IOException {
int moduleOffset = in.readInt();
Module module = modules.get(streamName);
if (module == null) {
@ -218,27 +205,57 @@ public class VBAMacroReader implements Closeable {
}
}
private static void readModule(DocumentInputStream dis, String name, ModuleMap modules) throws IOException {
private static void readModuleFromDocumentStream(DocumentNode documentNode, String name, ModuleMap modules) throws IOException {
Module module = modules.get(name);
// TODO Refactor this to fetch dir then do the rest
if (module == null) {
// no DIR stream with offsets yet, so store the compressed bytes for later
module = new Module();
modules.put(name, module);
module.read(dis);
InputStream dis = new DocumentInputStream(documentNode);
try {
module.read(dis);
} finally {
dis.close();
}
} else if (module.buf == null) { //if we haven't already read the bytes for the module keyed off this name...
if (module.offset == null) {
//This should not happen. bug 59858
throw new IOException("Module offset for '" + name + "' was never read.");
}
// we know the offset already, so decompress immediately on-the-fly
long skippedBytes = dis.skip(module.offset);
if (skippedBytes != module.offset) {
throw new IOException("tried to skip " + module.offset + " bytes, but actually skipped " + skippedBytes + " bytes");
//try the general case, where module.offset is accurate
InputStream decompressed = null;
InputStream compressed = new DocumentInputStream(documentNode);
try {
// we know the offset already, so decompress immediately on-the-fly
long skippedBytes = compressed.skip(module.offset);
if (skippedBytes != module.offset) {
throw new IOException("tried to skip " + module.offset + " bytes, but actually skipped " + skippedBytes + " bytes");
}
decompressed = new RLEDecompressingInputStream(compressed);
module.read(decompressed);
return;
} catch (IllegalArgumentException e) {
} catch (IllegalStateException e) {
} finally {
IOUtils.closeQuietly(compressed);
IOUtils.closeQuietly(decompressed);
}
//bad module.offset, try brute force
compressed = new DocumentInputStream(documentNode);
byte[] decompressedBytes = null;
try {
decompressedBytes = findCompressedStreamWBruteForce(compressed);
} finally {
IOUtils.closeQuietly(compressed);
}
if (decompressedBytes != null) {
module.read(new ByteArrayInputStream(decompressedBytes));
}
InputStream stream = new RLEDecompressingInputStream(dis);
module.read(stream);
stream.close();
}
}
@ -249,7 +266,7 @@ public class VBAMacroReader implements Closeable {
* @throws IOException If skipping would exceed the available data or skipping did not work.
*/
private static void trySkip(InputStream in, long n) throws IOException {
long skippedBytes = in.skip(n);
long skippedBytes = IOUtils.skipFully(in, n);
if (skippedBytes != n) {
if (skippedBytes < 0) {
throw new IOException(
@ -258,33 +275,18 @@ public class VBAMacroReader implements Closeable {
} else {
throw new IOException(
"Tried skipping " + n + " bytes, but only " + skippedBytes + " bytes were skipped. "
+ "This should never happen.");
+ "This should never happen with a non-corrupt file.");
}
}
}
// Constants from MS-OVBA: https://msdn.microsoft.com/en-us/library/office/cc313094(v=office.12).aspx
private static final int EOF = -1;
private static final int VERSION_INDEPENDENT_TERMINATOR = 0x0010;
@SuppressWarnings("unused")
private static final int VERSION_DEPENDENT_TERMINATOR = 0x002B;
private static final int PROJECTVERSION = 0x0009;
private static final int PROJECTCODEPAGE = 0x0003;
private static final int STREAMNAME = 0x001A;
private static final int MODULEOFFSET = 0x0031;
@SuppressWarnings("unused")
private static final int MODULETYPE_PROCEDURAL = 0x0021;
@SuppressWarnings("unused")
private static final int MODULETYPE_DOCUMENT_CLASS_OR_DESIGNER = 0x0022;
@SuppressWarnings("unused")
private static final int PROJECTLCID = 0x0002;
@SuppressWarnings("unused")
private static final int MODULE_NAME = 0x0019;
@SuppressWarnings("unused")
private static final int MODULE_NAME_UNICODE = 0x0047;
@SuppressWarnings("unused")
private static final int MODULE_DOC_STRING = 0x001c;
private static final int STREAMNAME_RESERVED = 0x0032;
private static final int PROJECT_CONSTANTS_RESERVED = 0x003C;
private static final int HELP_FILE_PATH_RESERVED = 0x003D;
private static final int REFERENCE_NAME_RESERVED = 0x003E;
private static final int DOC_STRING_RESERVED = 0x0040;
private static final int MODULE_DOCSTRING_RESERVED = 0x0048;
/**
* Reads VBA Project modules from a VBA Project directory located at
@ -293,76 +295,330 @@ public class VBAMacroReader implements Closeable {
* @since 3.15-beta2
*/
protected void readMacros(DirectoryNode macroDir, ModuleMap modules) throws IOException {
//bug59858 shows that dirstream may not be in this directory (\MBD00082648\_VBA_PROJECT_CUR\VBA ENTRY NAME)
//but may be in another directory (\_VBA_PROJECT_CUR\VBA ENTRY NAME)
//process the dirstream first -- "dir" is case insensitive
for (String entryName : macroDir.getEntryNames()) {
if ("dir".equalsIgnoreCase(entryName)) {
processDirStream(macroDir.getEntry(entryName), modules);
break;
}
}
for (Entry entry : macroDir) {
if (! (entry instanceof DocumentNode)) { continue; }
String name = entry.getName();
DocumentNode document = (DocumentNode)entry;
DocumentInputStream dis = new DocumentInputStream(document);
try {
if ("dir".equalsIgnoreCase(name)) {
// process DIR
RLEDecompressingInputStream in = new RLEDecompressingInputStream(dis);
String streamName = null;
int recordId = 0;
try {
while (true) {
recordId = in.readShort();
if (EOF == recordId
|| VERSION_INDEPENDENT_TERMINATOR == recordId) {
break;
}
int recordLength = in.readInt();
switch (recordId) {
case PROJECTVERSION:
trySkip(in, 6);
break;
case PROJECTCODEPAGE:
int codepage = in.readShort();
modules.charset = Charset.forName(CodePageUtil.codepageToEncoding(codepage, true));
break;
case STREAMNAME:
streamName = readString(in, recordLength, modules.charset);
int reserved = in.readShort();
if (reserved != STREAMNAME_RESERVED) {
throw new IOException("Expected x0032 after stream name before Unicode stream name, but found: "+
Integer.toHexString(reserved));
}
int unicodeNameRecordLength = in.readInt();
readUnicodeString(in, unicodeNameRecordLength);
// do something with this at some point
break;
case MODULEOFFSET:
readModule(in, streamName, modules);
break;
default:
trySkip(in, recordLength);
break;
}
}
} catch (final IOException e) {
throw new IOException(
"Error occurred while reading macros at section id "
+ recordId + " (" + HexDump.shortToHex(recordId) + ")", e);
}
finally {
in.close();
}
} else if (!startsWithIgnoreCase(name, "__SRP")
if (! "dir".equalsIgnoreCase(name) && !startsWithIgnoreCase(name, "__SRP")
&& !startsWithIgnoreCase(name, "_VBA_PROJECT")) {
// process module, skip __SRP and _VBA_PROJECT since these do not contain macros
readModule(dis, name, modules);
}
}
finally {
dis.close();
readModuleFromDocumentStream(document, name, modules);
}
}
}
/**
 * Record identifiers that can appear in the decompressed dir stream.
 * <p>
 * Each constant carries its numeric record id and, where one was declared, a constant
 * number of bytes that the reader skips right after the id. Records declared without
 * such a length report {@link #VARIABLE_LENGTH}.
 */
private enum RecordType {
    // Constants from MS-OVBA: https://msdn.microsoft.com/en-us/library/office/cc313094(v=office.12).aspx
    MODULE_OFFSET(0x0031),
    PROJECT_SYS_KIND(0x01),
    PROJECT_LCID(0x0002),
    PROJECT_LCID_INVOKE(0x14),
    PROJECT_CODEPAGE(0x0003),
    PROJECT_NAME(0x04),
    PROJECT_DOC_STRING(0x05),
    PROJECT_HELP_FILE_PATH(0x06),
    PROJECT_HELP_CONTEXT(0x07, 8),
    PROJECT_LIB_FLAGS(0x08),
    PROJECT_VERSION(0x09, 10),
    PROJECT_CONSTANTS(0x0C),
    PROJECT_MODULES(0x0F),
    DIR_STREAM_TERMINATOR(0x10),
    PROJECT_COOKIE(0x13),
    MODULE_NAME(0x19),
    MODULE_NAME_UNICODE(0x47),
    MODULE_STREAM_NAME(0x1A),
    MODULE_DOC_STRING(0x1C),
    MODULE_HELP_CONTEXT(0x1E),
    MODULE_COOKIE(0x2c),
    MODULE_TYPE_PROCEDURAL(0x21, 4),
    MODULE_TYPE_OTHER(0x22, 4),
    MODULE_PRIVATE(0x28, 4),
    REFERENCE_NAME(0x16),
    REFERENCE_REGISTERED(0x0D),
    REFERENCE_PROJECT(0x0E),
    REFERENCE_CONTROL_A(0x2F),

    //according to the spec, REFERENCE_CONTROL_B(0x33) should have the
    //same structure as REFERENCE_CONTROL_A(0x2F).
    //However, it seems to have the int(length) record structure that most others do.
    //See 59830.xls for this record.
    REFERENCE_CONTROL_B(0x33),
    //REFERENCE_ORIGINAL(0x33),

    MODULE_TERMINATOR(0x002B),
    EOF(-1),
    UNKNOWN(-2);

    /** Value of {@link #getConstantLength()} for records declared without a constant length. */
    private static final int VARIABLE_LENGTH = -1;

    /** Index from record id to type, built once so lookups neither scan nor clone values(). */
    private static final Map<Integer, RecordType> BY_ID = new HashMap<Integer, RecordType>();
    static {
        for (RecordType type : values()) {
            BY_ID.put(type.id, type);
        }
    }

    private final int id;
    private final int constantLength;

    RecordType(int id) {
        this(id, VARIABLE_LENGTH);
    }

    RecordType(int id, int constantLength) {
        this.id = id;
        this.constantLength = constantLength;
    }

    /**
     * @return the constant length declared for this record, or -1 if none was declared
     */
    int getConstantLength() {
        return constantLength;
    }

    /**
     * Maps a raw record id to its type.
     *
     * @param id the record id as read from the stream
     * @return the matching type, or {@link #UNKNOWN} if no constant has this id
     */
    static RecordType lookup(int id) {
        RecordType type = BY_ID.get(id);
        return type == null ? UNKNOWN : type;
    }
}
/**
 * Coarse position of the parser within the dir stream while its records are being read.
 */
private enum DIR_STATE {
// state the dir stream parser starts in
INFORMATION_RECORD,
// entered when a REFERENCE_NAME record is seen while still in INFORMATION_RECORD
REFERENCES_RECORD,
// entered when the PROJECT_MODULES record is seen
MODULES_RECORD
}
/**
 * Immutable holder for two renderings of the same value: the one kept
 * under the name "ascii" and the one kept under the name "unicode".
 */
private static class ASCIIUnicodeStringPair {
    private final String ascii;
    private final String unicode;

    /**
     * @param asciiValue the value to expose through {@link #getAscii()}
     * @param unicodeValue the value to expose through {@link #getUnicode()}
     */
    ASCIIUnicodeStringPair(String asciiValue, String unicodeValue) {
        ascii = asciiValue;
        unicode = unicodeValue;
    }

    /** @return the first value given to the constructor */
    private String getAscii() {
        return this.ascii;
    }

    /** @return the second value given to the constructor */
    private String getUnicode() {
        return this.unicode;
    }
}
/**
 * Decompresses the dir stream and walks its records, storing what is needed to
 * read the modules later: the project charset and, for every module, the
 * metadata read by <tt>readModuleMetadataFromDirStream</tt> keyed by stream name.
 * All other records are read past.
 *
 * @param dir the dir entry; it is cast to a {@link DocumentNode}
 * @param modules receives the project charset and the per-module metadata
 * @throws IOException If the stream cannot be read or a record does not have the expected
 *  layout; the exception names the id of the record that was being processed
 */
private void processDirStream(Entry dir, ModuleMap modules) throws IOException {
    DocumentNode dirDocumentNode = (DocumentNode)dir;
    DocumentInputStream dis = new DocumentInputStream(dirDocumentNode);
    DIR_STATE dirState = DIR_STATE.INFORMATION_RECORD;
    try {
        RLEDecompressingInputStream in = new RLEDecompressingInputStream(dis);
        String streamName = null;
        int recordId = 0;
        try {
            while (true) {
                recordId = in.readShort();
                // an id of -1 resolves to RecordType.EOF, so no separate check is needed
                RecordType type = RecordType.lookup(recordId);
                if (type == RecordType.EOF || type == RecordType.DIR_STREAM_TERMINATOR) {
                    break;
                }

                switch (type) {
                    case PROJECT_VERSION:
                        trySkip(in, RecordType.PROJECT_VERSION.getConstantLength());
                        break;
                    case PROJECT_CODEPAGE:
                        in.readInt();//record size must == 4
                        int codepage = in.readShort();
                        modules.charset = Charset.forName(CodePageUtil.codepageToEncoding(codepage, true));
                        break;
                    case MODULE_STREAM_NAME:
                        ASCIIUnicodeStringPair pair = readStringPair(in, modules.charset, STREAMNAME_RESERVED);
                        // remembered for the MODULE_OFFSET record that follows
                        streamName = pair.getAscii();
                        break;
                    case PROJECT_DOC_STRING:
                        readStringPair(in, modules.charset, DOC_STRING_RESERVED);
                        break;
                    case PROJECT_HELP_FILE_PATH:
                        readStringPair(in, modules.charset, HELP_FILE_PATH_RESERVED);
                        break;
                    case PROJECT_CONSTANTS:
                        readStringPair(in, modules.charset, PROJECT_CONSTANTS_RESERVED);
                        break;
                    case REFERENCE_NAME:
                        if (dirState.equals(DIR_STATE.INFORMATION_RECORD)) {
                            dirState = DIR_STATE.REFERENCES_RECORD;
                        }
                        readStringPair(in, modules.charset, REFERENCE_NAME_RESERVED);
                        break;
                    case MODULE_DOC_STRING:
                        // same layout as the other string pairs; the value is not used yet
                        readStringPair(in, modules.charset, MODULE_DOCSTRING_RESERVED);
                        break;
                    case MODULE_OFFSET:
                        in.readInt();//record size, should be 4
                        readModuleMetadataFromDirStream(in, streamName, modules);
                        break;
                    case PROJECT_MODULES:
                        dirState = DIR_STATE.MODULES_RECORD;
                        in.readInt();//size must == 2
                        in.readShort();//number of modules
                        break;
                    case REFERENCE_CONTROL_A:
                        int szTwiddled = in.readInt();
                        trySkip(in, szTwiddled);
                        int nextRecord = in.readShort();
                        //reference name is optional!
                        if (nextRecord == RecordType.REFERENCE_NAME.id) {
                            readStringPair(in, modules.charset, REFERENCE_NAME_RESERVED);
                            nextRecord = in.readShort();
                        }
                        if (nextRecord != 0x30) {
                            throw new IOException("Expected 0x30 as Reserved3 in a ReferenceControl record");
                        }
                        int szExtended = in.readInt();
                        trySkip(in, szExtended);
                        break;
                    case MODULE_TERMINATOR:
                        in.readInt();//reserved, must be 0
                        break;
                    default:
                        if (type.getConstantLength() > -1) {
                            trySkip(in, type.getConstantLength());
                        } else {
                            int recordLength = in.readInt();
                            trySkip(in, recordLength);
                        }
                        break;
                }
            }
        } catch (final IOException e) {
            throw new IOException(
                    "Error occurred while reading macros at section id "
                            + recordId + " (" + HexDump.shortToHex(recordId) + ")", e);
        } finally {
            in.close();
        }
    } finally {
        dis.close();
    }
}
/**
 * Reads a value that is stored twice in a row: an int length followed by that many bytes
 * in <tt>charset</tt>, then a reserved short, then an int length followed by that many
 * bytes of UTF-16LE text.
 *
 * @param in the stream to read from
 * @param charset the character set of the first string
 * @param reservedByte the value the reserved short between the two strings must have
 * @return both strings
 * @throws IOException If the reserved short has a different value or reading fails
 */
private ASCIIUnicodeStringPair readStringPair(RLEDecompressingInputStream in, Charset charset, int reservedByte) throws IOException {
    int nameLength = in.readInt();
    String ascii = readString(in, nameLength, charset);
    int reserved = in.readShort();
    if (reserved != reservedByte) {
        throw new IOException("Expected " + Integer.toHexString(reservedByte) + " after name before Unicode name, but found: " +
                Integer.toHexString(reserved));
    }
    int unicodeNameRecordLength = in.readInt();
    String unicode = readUnicodeString(in, unicodeNameRecordLength);
    return new ASCIIUnicodeStringPair(ascii, unicode);
}
/**
 * Reads exactly <tt>length</tt> bytes from the stream and decodes them
 * as MBCS (multi-byte character set) text.
 *
 * @param stream the inputstream to read from
 * @param length number of bytes to read from stream
 * @param charset the character set encoding of the bytes in the stream
 * @return the decoded bytes as a java String
 * @throws IOException If fewer than <tt>length</tt> bytes could be read
 *  or reading from the stream fails
 */
private static String readString(InputStream stream, int length, Charset charset) throws IOException {
    final byte[] raw = IOUtils.safelyAllocate(length, 20000);
    final int actual = IOUtils.readFully(stream, raw);
    if (actual == length) {
        return new String(raw, 0, length, charset);
    }
    throw new IOException("Tried to read: " + length + ", but could only read: " + actual);
}
/**
 * Reads <tt>unicodeNameRecordLength</tt> bytes from the stream and decodes them as UTF-16LE.
 *
 * @param in the stream to read from
 * @param unicodeNameRecordLength number of bytes to read from the stream
 * @return the decoded string
 * @throws IOException If fewer bytes than requested could be read or reading fails
 */
private String readUnicodeString(RLEDecompressingInputStream in, int unicodeNameRecordLength) throws IOException {
    byte[] buffer = IOUtils.safelyAllocate(unicodeNameRecordLength, 20000);
    int bytesRead = IOUtils.readFully(in, buffer);
    // a short read means the record is truncated; fail like readString does
    // instead of decoding a partially filled buffer
    if (bytesRead != unicodeNameRecordLength) {
        throw new IOException("Tried to read: " + unicodeNameRecordLength +
                ", but could only read: " + bytesRead);
    }
    return new String(buffer, StringUtil.UTF16LE);
}
/**
 * Sometimes the offset record in the dirstream is incorrect, but the macro can still be found.
 * This will try to find the first RLEDecompressing stream that starts with "Attribute".
 * This relies on some, er, heuristics, admittedly.
 *
 * @param is full module inputstream to read
 * @return the first decompressed candidate that contains "Attribute" within its first
 *  20 bytes; if there is none, the result of the last decompression attempt, which is
 *  <code>null</code> if that attempt failed or no attempt was made
 * @throws IOException for a true IOException copying the is to a byte array
 */
private static byte[] findCompressedStreamWBruteForce(InputStream is) throws IOException {
    //buffer to memory for multiple tries
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    IOUtils.copy(is, bos);
    byte[] compressed = bos.toByteArray();
    byte[] decompressed = null;
    // a candidate needs the 0x01 byte plus the two bytes read by getUShort(i+1),
    // so the last usable start index is length-3
    for (int i = 0; i < compressed.length - 2; i++) {
        if (compressed[i] != 0x01) {
            continue;
        }
        int w = LittleEndian.getUShort(compressed, i+1);
        if (w <= 0 || (w & 0x7000) != 0x3000) {
            continue;
        }
        decompressed = tryToDecompress(new ByteArrayInputStream(compressed, i, compressed.length - i));
        if (decompressed != null && decompressed.length > 9) {
            //this is a complete hack. The challenge is that there
            //can be many 0 length or junk streams that are uncompressed
            //look in the first 20 characters for "Attribute"
            int firstX = Math.min(20, decompressed.length);
            String start = new String(decompressed, 0, firstX, StringUtil.WIN_1252);
            if (start.contains("Attribute")) {
                return decompressed;
            }
        }
    }
    return decompressed;
}
/**
 * Attempts to run the given bytes through an {@link RLEDecompressingInputStream}.
 *
 * @param is the stream to decompress
 * @return everything that was decompressed, or <code>null</code> if the attempt ended with
 *  an {@link IOException}, {@link IllegalArgumentException} or {@link IllegalStateException}
 */
private static byte[] tryToDecompress(InputStream is) {
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    try {
        final InputStream inflating = new RLEDecompressingInputStream(is);
        IOUtils.copy(inflating, sink);
        return sink.toByteArray();
    } catch (IOException ioe) {
        return null;
    } catch (IllegalStateException ise) {
        return null;
    } catch (IllegalArgumentException iae) {
        return null;
    }
}
}

View File

@ -156,7 +156,7 @@ public class RLEDecompressingInputStream extends InputStream {
private int readChunk() throws IOException {
pos = 0;
int w = readShort(in);
if (w == -1) {
if (w == -1 || w == 0) {
return -1;
}
int chunkSize = (w & 0x0FFF) + 1; // plus 3 bytes minus 2 for the length

View File

@ -33,6 +33,7 @@ import java.util.HashMap;
import java.util.Map;
import static org.apache.poi.POITestCase.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
@ -251,6 +252,7 @@ public class TestVBAMacroReader {
File f = POIDataSamples.getSpreadSheetInstance().getFile("59830.xls");
VBAMacroReader r = new VBAMacroReader(f);
Map<String, String> macros = r.readMacros();
assertEquals(29, macros.size());
assertNotNull(macros.get("Module20"));
assertContains(macros.get("Module20"), "here start of superscripting");
r.close();
@ -261,6 +263,7 @@ public class TestVBAMacroReader {
File f = POIDataSamples.getSpreadSheetInstance().getFile("59858.xls");
VBAMacroReader r = new VBAMacroReader(f);
Map<String, String> macros = r.readMacros();
assertEquals(11, macros.size());
assertNotNull(macros.get("Sheet4"));
assertContains(macros.get("Sheet4"), "intentional constituent");
r.close();
@ -271,6 +274,7 @@ public class TestVBAMacroReader {
File f = POIDataSamples.getDocumentInstance().getFile("60158.docm");
VBAMacroReader r = new VBAMacroReader(f);
Map<String, String> macros = r.readMacros();
assertEquals(2, macros.size());
assertNotNull(macros.get("NewMacros"));
assertContains(macros.get("NewMacros"), "' dirty");
r.close();
@ -282,8 +286,24 @@ public class TestVBAMacroReader {
File f = POIDataSamples.getSpreadSheetInstance().getFile("60273.xls");
VBAMacroReader r = new VBAMacroReader(f);
Map<String, String> macros = r.readMacros();
assertEquals(2, macros.size());
assertNotNull(macros.get("Module1"));
assertContains(macros.get("Module1"), "9/8/2004");
r.close();
}
/**
 * The module in 60279.doc can only be read by falling back to the brute-force
 * search for the macro content (bug 60279).
 */
@Test
public void bug60279() throws IOException {
    File f = POIDataSamples.getDocumentInstance().getFile("60279.doc");
    VBAMacroReader r = new VBAMacroReader(f);
    try {
        Map<String, String> macros = r.readMacros();
        assertEquals(1, macros.size());
        String content = macros.get("ThisDocument");
        assertNotNull(content);
        assertContains(content, "Attribute VB_Base = \"1Normal.ThisDocument\"");
        assertContains(content, "Attribute VB_Customizable = True");
    } finally {
        // release the file even when an assertion above fails
        r.close();
    }
}
}

Binary file not shown.