Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@951034 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2010-06-03 15:33:54 +00:00
parent 9c68267a68
commit 93f034976a
6 changed files with 70 additions and 3 deletions

View File

@ -34,6 +34,7 @@
<changes>
<release version="3.7-SNAPSHOT" date="2010-??-??">
<action dev="POI-DEVELOPERS" type="add">Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - detect and support time formats like HH:MM;HH:MM</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
<action dev="POI-DEVELOPERS" type="fix">48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>

View File

@ -282,6 +282,17 @@ public class MAPIMessage extends POIDocument {
return names;
}
/**
 * Gets the raw headers of the parsed Outlook Message, taken from
 * the message headers chunk and broken up into individual lines.
 * <p>The text is split on LF or CRLF line endings only. Continuation
 * lines of a folded header are not re-joined, so they come back as
 * their own array entries.</p>
 *
 * @return the header lines in the order they appear in the chunk, or
 *         <code>null</code> if the chunk yields a null string
 * @throws ChunkNotFoundException propagated from the chunk lookup
 *         (presumably when the message has no headers chunk)
 */
public String[] getHeaders() throws ChunkNotFoundException {
   final String rawHeaders = getStringFromChunk(mainChunks.messageHeaders);
   return (rawHeaders == null) ? null : rawHeaders.split("\\r?\\n");
}
/**
* Gets the conversation topic of the parsed Outlook Message.

View File

@ -35,6 +35,7 @@ public final class Chunks implements ChunkGroup {
// 0x0050 -> 0x006F seem to be routing info or similar
public static final int CONVERSATION_TOPIC = 0x0070;
public static final int SENT_BY_SERVER_TYPE = 0x0075;
public static final int MESSAGE_HEADERS = 0x007D;
// RECEIVEDEMAIL = 76
public static final int DISPLAY_TO = 0x0E04;
public static final int DISPLAY_FROM = 0x0C1A;
@ -66,6 +67,8 @@ public final class Chunks implements ChunkGroup {
public StringChunk conversationTopic;
/** Type of server that the message originated from (SMTP, etc). */
public StringChunk sentByServerType;
/** The email headers */
public StringChunk messageHeaders;
/** TODO */
public MessageSubmissionChunk submissionChunk;
/** TODO */
@ -104,6 +107,9 @@ public final class Chunks implements ChunkGroup {
case SENT_BY_SERVER_TYPE:
sentByServerType = (StringChunk)chunk;
break;
case MESSAGE_HEADERS:
messageHeaders = (StringChunk)chunk;
break;
case DISPLAY_TO:
displayToChunk = (StringChunk)chunk;
break;

View File

@ -87,10 +87,30 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
} catch(ChunkNotFoundException e) {}
// Date - try two ways to find it
try {
// First try via the proper chunk
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
} catch(ChunkNotFoundException e) {}
} catch(ChunkNotFoundException e) {
try {
// Failing that try via the raw headers
String[] headers = msg.getHeaders();
for(String header: headers) {
if(header.toLowerCase().startsWith("date:")) {
s.append(
"Date:" +
header.substring(header.indexOf(':')+1) +
"\n"
);
break;
}
}
} catch(ChunkNotFoundException he) {
// We can't find the date, sorry...
}
}
try {
s.append("Subject: " + msg.getSubject() + "\n");
} catch(ChunkNotFoundException e) {}

View File

@ -75,6 +75,35 @@ public final class TestBasics extends TestCase {
assertEquals("test pi\u00e8ce jointe 1", attachments.getSubject());
}
/**
 * Test message headers: checks the number of header lines and a few
 * known entries for the messages that carry a headers chunk, and that
 * a ChunkNotFoundException is raised for the messages that don't.
 */
public void testHeaders() throws Exception {
   // Simple email first
   String[] simpleHeaders = simple.getHeaders();
   assertEquals(26, simpleHeaders.length);
   assertTrue(simpleHeaders[0].startsWith("Return-path:"));
   // assertEquals rather than assertTrue(equals) so a failure shows the actual value
   assertEquals("Envelope-to: travis@overwrittenstack.com", simpleHeaders[1]);
   assertTrue(simpleHeaders[25].startsWith("X-Antivirus-Scanner: Clean"));

   // Quick doesn't have them
   try {
      quick.getHeaders();
      fail("Expected ChunkNotFoundException, quick has no headers");
   } catch(ChunkNotFoundException e) {
      // Expected
   }

   // Attachments doesn't have them
   try {
      attachments.getHeaders();
      fail("Expected ChunkNotFoundException, attachments has no headers");
   } catch(ChunkNotFoundException e) {
      // Expected
   }

   // Outlook30 has some
   String[] outlook30Headers = outlook30.getHeaders();
   assertEquals(33, outlook30Headers.length);
   assertTrue(outlook30Headers[0].startsWith("Microsoft Mail Internet Headers"));
   assertTrue(outlook30Headers[1].startsWith("x-mimeole:"));
   // Last entry is a folded continuation line, returned as its own element
   assertTrue(outlook30Headers[32].startsWith("\t\"Williams")); // May need better parsing in future
}
/**
* Test attachments
*/

View File

@ -84,7 +84,7 @@ public final class TestOutlookTextExtractor extends TestCase {
assertEquals(-1, text.indexOf("CC:"));
assertEquals(-1, text.indexOf("BCC:"));
assertContains(text, "Subject: test message\n");
assertEquals(-1, text.indexOf("Date:"));
assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n");
assertContains(text, "This is a test message.");
}
@ -171,7 +171,7 @@ public final class TestOutlookTextExtractor extends TestCase {
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
assertEquals(-1, text.indexOf("BCC:"));
assertContains(text, "Subject: This is a test message please ignore\n");
assertEquals(-1, text.indexOf("Date:"));
assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n");
assertContains(text, "The quick brown fox jumps over the lazy dog");
}
}