mirror of https://github.com/apache/poi.git
Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@951034 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9c68267a68
commit
93f034976a
|
@ -34,6 +34,7 @@
|
|||
|
||||
<changes>
|
||||
<release version="3.7-SNAPSHOT" date="2010-??-??">
|
||||
<action dev="POI-DEVELOPERS" type="add">Parse the HSMF headers chunk if present, and use it to find Dates in text extraction if needed</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">48494 - detect and support time formats like HH:MM;HH:MM</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">48494 - have ExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
|
||||
<action dev="POI-DEVELOPERS" type="fix">48494 - have EventBasedExcelExtractor make use of HSSFDataFormatter, so that numbers and dates come out closer to how Excel would render them</action>
|
||||
|
|
|
@ -282,6 +282,17 @@ public class MAPIMessage extends POIDocument {
|
|||
return names;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the raw headers of the parsed Outlook Message, one array entry
* per line of the headers text.
* <p>
* The headers string is read from the message headers chunk and split
* on LF or CRLF line breaks. No further parsing is done, so a header
* that is folded over several lines comes back as several entries.
*
* @return the header lines, or <code>null</code> if the string read
*         from the headers chunk is <code>null</code>
* @throws ChunkNotFoundException propagated from getStringFromChunk;
*         presumably raised when the message has no headers chunk
|
||||
*/
|
||||
public String[] getHeaders() throws ChunkNotFoundException {
|
||||
String headers = getStringFromChunk(mainChunks.messageHeaders);
|
||||
if(headers == null) {
|
||||
return null;
|
||||
}
|
||||
// Break on either LF or CRLF, giving one entry per physical line
return headers.split("\\r?\\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the conversation topic of the parsed Outlook Message.
|
||||
|
|
|
@ -35,6 +35,7 @@ public final class Chunks implements ChunkGroup {
|
|||
// 0x0050 -> 0x006F seem to be routing info or similar
|
||||
public static final int CONVERSATION_TOPIC = 0x0070;
|
||||
public static final int SENT_BY_SERVER_TYPE = 0x0075;
|
||||
public static final int MESSAGE_HEADERS = 0x007D;
|
||||
// RECEIVEDEMAIL = 76
|
||||
public static final int DISPLAY_TO = 0x0E04;
|
||||
public static final int DISPLAY_FROM = 0x0C1A;
|
||||
|
@ -66,6 +67,8 @@ public final class Chunks implements ChunkGroup {
|
|||
public StringChunk conversationTopic;
|
||||
/** Type of server that the message originated from (SMTP, etc). */
|
||||
public StringChunk sentByServerType;
|
||||
/** The email headers */
|
||||
public StringChunk messageHeaders;
|
||||
/** TODO */
|
||||
public MessageSubmissionChunk submissionChunk;
|
||||
/** TODO */
|
||||
|
@ -104,6 +107,9 @@ public final class Chunks implements ChunkGroup {
|
|||
case SENT_BY_SERVER_TYPE:
|
||||
sentByServerType = (StringChunk)chunk;
|
||||
break;
|
||||
case MESSAGE_HEADERS:
|
||||
messageHeaders = (StringChunk)chunk;
|
||||
break;
|
||||
case DISPLAY_TO:
|
||||
displayToChunk = (StringChunk)chunk;
|
||||
break;
|
||||
|
|
|
@ -87,10 +87,30 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
|
|||
handleEmails(s, "BCC", msg.getDisplayBCC(), emails);
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
|
||||
// Date - try two ways to find it
|
||||
try {
|
||||
// First try via the proper chunk
|
||||
SimpleDateFormat f = new SimpleDateFormat("E, d MMM yyyy HH:mm:ss");
|
||||
s.append("Date: " + f.format(msg.getMessageDate().getTime()) + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
} catch(ChunkNotFoundException e) {
|
||||
try {
|
||||
// Failing that try via the raw headers
|
||||
String[] headers = msg.getHeaders();
|
||||
for(String header: headers) {
|
||||
if(header.toLowerCase().startsWith("date:")) {
|
||||
s.append(
|
||||
"Date:" +
|
||||
header.substring(header.indexOf(':')+1) +
|
||||
"\n"
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch(ChunkNotFoundException he) {
|
||||
// We can't find the date, sorry...
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
s.append("Subject: " + msg.getSubject() + "\n");
|
||||
} catch(ChunkNotFoundException e) {}
|
||||
|
|
|
@ -75,6 +75,35 @@ public final class TestBasics extends TestCase {
|
|||
assertEquals("test pi\u00e8ce jointe 1", attachments.getSubject());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test message headers: messages that carry a headers chunk expose it
* as one array entry per line, and messages without one make
* getHeaders() throw a ChunkNotFoundException.
|
||||
*/
|
||||
public void testHeaders() throws Exception {
|
||||
// Simple email first
|
||||
assertEquals(26, simple.getHeaders().length);
|
||||
assertTrue(simple.getHeaders()[0].startsWith("Return-path:"));
|
||||
assertTrue(simple.getHeaders()[1].equals("Envelope-to: travis@overwrittenstack.com"));
|
||||
assertTrue(simple.getHeaders()[25].startsWith("X-Antivirus-Scanner: Clean"));
|
||||

||||
// Quick doesn't have them
|
||||
try {
|
||||
quick.getHeaders();
|
||||
fail();
|
||||
} catch(ChunkNotFoundException e) {} // Expected, the call must not return normally
|
||||

||||
// Attachments doesn't have them
|
||||
try {
|
||||
attachments.getHeaders();
|
||||
fail();
|
||||
} catch(ChunkNotFoundException e) {} // Expected, the call must not return normally
|
||||

||||
// Outlook30 has some
|
||||
assertEquals(33, outlook30.getHeaders().length);
|
||||
assertTrue(outlook30.getHeaders()[0].startsWith("Microsoft Mail Internet Headers"));
|
||||
assertTrue(outlook30.getHeaders()[1].startsWith("x-mimeole:"));
|
||||
// The last entry starts with a tab, i.e. it looks like the continuation of a folded header
assertTrue(outlook30.getHeaders()[32].startsWith("\t\"Williams")); // May need better parsing in future
|
||||
}
|
||||
|
||||
/**
|
||||
* Test attachments
|
||||
*/
|
||||
|
|
|
@ -84,7 +84,7 @@ public final class TestOutlookTextExtractor extends TestCase {
|
|||
assertEquals(-1, text.indexOf("CC:"));
|
||||
assertEquals(-1, text.indexOf("BCC:"));
|
||||
assertContains(text, "Subject: test message\n");
|
||||
assertEquals(-1, text.indexOf("Date:"));
|
||||
assertContains(text, "Date: Fri, 6 Jul 2007 01:27:17 -0400\n");
|
||||
assertContains(text, "This is a test message.");
|
||||
}
|
||||
|
||||
|
@ -171,7 +171,7 @@ public final class TestOutlookTextExtractor extends TestCase {
|
|||
"nick.burch@alfresco.com; 'Roy Wetherall' <roy.wetherall@alfresco.com>\n");
|
||||
assertEquals(-1, text.indexOf("BCC:"));
|
||||
assertContains(text, "Subject: This is a test message please ignore\n");
|
||||
assertEquals(-1, text.indexOf("Date:"));
|
||||
assertContains(text, "Date: Mon, 11 Jan 2010 16:25:07 +0000 (GMT)\n");
|
||||
assertContains(text, "The quick brown fox jumps over the lazy dog");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue