Update OutlookTextExtractor to request 7 bit encoding guessing

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1087734 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2011-04-01 15:02:14 +00:00
parent dcdb268d3b
commit 296abdab3b
4 changed files with 62 additions and 19 deletions

View File

@ -34,6 +34,7 @@
<changes> <changes>
<release version="3.8-beta3" date="2011-??-??"> <release version="3.8-beta3" date="2011-??-??">
<action dev="poi-developers" type="fix">OutlookTextExtractor now requests 7 bit encoding guessing</action>
<action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action> <action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
<action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action> <action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
</release> </release>

View File

@ -32,9 +32,8 @@ import org.apache.poi.util.StringUtil;
public class StringChunk extends Chunk { public class StringChunk extends Chunk {
private static final String DEFAULT_ENCODING = "CP1252"; private static final String DEFAULT_ENCODING = "CP1252";
private String encoding7Bit = DEFAULT_ENCODING; private String encoding7Bit = DEFAULT_ENCODING;
private String value;
/** Only kept around for 7 bit strings */
private byte[] rawValue; private byte[] rawValue;
private String value;
/** /**
* Creates a String Chunk. * Creates a String Chunk.
@ -72,23 +71,22 @@ public class StringChunk extends Chunk {
// Re-read the String if we're a 7 bit one // Re-read the String if we're a 7 bit one
if(type == Types.ASCII_STRING) { if(type == Types.ASCII_STRING) {
parseString(rawValue); parseString();
} }
} }
public void readValue(InputStream value) throws IOException { public void readValue(InputStream value) throws IOException {
byte[] data = IOUtils.toByteArray(value); rawValue = IOUtils.toByteArray(value);
parseString(data); parseString();
} }
private void parseString(byte[] data) { private void parseString() {
String tmpValue; String tmpValue;
switch(type) { switch(type) {
case Types.ASCII_STRING: case Types.ASCII_STRING:
tmpValue = parseAs7BitData(data, encoding7Bit); tmpValue = parseAs7BitData(rawValue, encoding7Bit);
this.rawValue = data;
break; break;
case Types.UNICODE_STRING: case Types.UNICODE_STRING:
tmpValue = StringUtil.getFromUnicodeLE(data); tmpValue = StringUtil.getFromUnicodeLE(rawValue);
break; break;
default: default:
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk"); throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
@ -99,34 +97,46 @@ public class StringChunk extends Chunk {
} }
public void writeValue(OutputStream out) throws IOException { public void writeValue(OutputStream out) throws IOException {
byte[] data; out.write(rawValue);
}
private void storeString() {
switch(type) { switch(type) {
case Types.ASCII_STRING: case Types.ASCII_STRING:
try { try {
data = value.getBytes(encoding7Bit); rawValue = value.getBytes(encoding7Bit);
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {
throw new RuntimeException("Encoding not found - " + encoding7Bit, e); throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
} }
break; break;
case Types.UNICODE_STRING: case Types.UNICODE_STRING:
data = new byte[value.length()*2]; rawValue = new byte[value.length()*2];
StringUtil.putUnicodeLE(value, data, 0); StringUtil.putUnicodeLE(value, rawValue, 0);
break; break;
default: default:
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk"); throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
} }
out.write(data);
} }
/**
* Returns the Text value of the chunk
*/
public String getValue() { public String getValue() {
return this.value; return this.value;
} }
public String toString() {
return this.value; public byte[] getRawValue() {
} return this.rawValue;
}
public void setValue(String str) {
this.value = str;
storeString();
}
public String toString() {
return this.value;
}
/** /**
* Parses as non-unicode, supposedly 7 bit CP1252 data * Parses as non-unicode, supposedly 7 bit CP1252 data
* and returns the string that that yields. * and returns the string that that yields.

View File

@ -16,6 +16,7 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.hsmf.extractor; package org.apache.poi.hsmf.extractor;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
@ -56,6 +57,15 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
public OutlookTextExtactor(InputStream inp) throws IOException { public OutlookTextExtactor(InputStream inp) throws IOException {
this(new MAPIMessage(inp)); this(new MAPIMessage(inp));
} }
/**
 * Command line entry point: treats every argument as the name of a
 * file, opens it, and prints the text extracted from it to standard out.
 *
 * @param args the names of the files to extract text from
 * @throws Exception if a file cannot be opened or processed
 */
public static void main(String[] args) throws Exception {
    for (int i = 0; i < args.length; i++) {
        // Open the named file, then wrap it in an extractor
        NPOIFSFileSystem fs = new NPOIFSFileSystem(new File(args[i]));
        OutlookTextExtactor extractor = new OutlookTextExtactor(fs);

        String text = extractor.getText();
        System.out.println(text);
    }
}
/** /**
* Returns the underlying MAPI message * Returns the underlying MAPI message
@ -71,6 +81,11 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
MAPIMessage msg = (MAPIMessage)document; MAPIMessage msg = (MAPIMessage)document;
StringBuffer s = new StringBuffer(); StringBuffer s = new StringBuffer();
// See if we can get a suitable encoding for any
// non unicode text in the file
msg.guess7BitEncoding();
// Off we go
StringsIterator emails; StringsIterator emails;
try { try {
emails = new StringsIterator( emails = new StringsIterator(

View File

@ -199,4 +199,21 @@ public final class TestOutlookTextExtractor extends TestCase {
// Embedded bits are checked in // Embedded bits are checked in
// TestExtractorFactory // TestExtractorFactory
} }
/**
 * Extracts the text of the chinese-traditional.msg sample and checks
 * that both the english and the chinese parts show up in the output.
 */
public void testEncodings() throws Exception {
    FileInputStream sampleStream = new FileInputStream(
        samples.getFile("chinese-traditional.msg")
    );
    POIFSFileSystem fs = new POIFSFileSystem(sampleStream);
    OutlookTextExtactor extractor = new OutlookTextExtactor(new MAPIMessage(fs));
    String extracted = extractor.getText();

    // English parts first
    assertContains(extracted, "From: Tests Chang@FT");
    assertContains(extracted, "tests.chang@fengttt.com");

    // Then the chinese parts
    assertContains(extracted, "(\u5f35\u6bd3\u502b)");
    assertContains(extracted, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
}
} }