mirror of https://github.com/apache/poi.git
Update OutlookTextExtractor to request 7 bit encoding guessing
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1087734 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dcdb268d3b
commit
296abdab3b
|
@ -34,6 +34,7 @@
|
||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.8-beta3" date="2011-??-??">
|
<release version="3.8-beta3" date="2011-??-??">
|
||||||
|
<action dev="poi-developers" type="fix">OutlookTextExtractor now requests 7 bit encoding guessing</action>
|
||||||
<action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
|
<action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
|
||||||
<action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
|
<action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
|
||||||
</release>
|
</release>
|
||||||
|
|
|
@ -32,9 +32,8 @@ import org.apache.poi.util.StringUtil;
|
||||||
public class StringChunk extends Chunk {
|
public class StringChunk extends Chunk {
|
||||||
private static final String DEFAULT_ENCODING = "CP1252";
|
private static final String DEFAULT_ENCODING = "CP1252";
|
||||||
private String encoding7Bit = DEFAULT_ENCODING;
|
private String encoding7Bit = DEFAULT_ENCODING;
|
||||||
private String value;
|
|
||||||
/** Only kept around for 7 bit strings */
|
|
||||||
private byte[] rawValue;
|
private byte[] rawValue;
|
||||||
|
private String value;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a String Chunk.
|
* Creates a String Chunk.
|
||||||
|
@ -72,23 +71,22 @@ public class StringChunk extends Chunk {
|
||||||
|
|
||||||
// Re-read the String if we're a 7 bit one
|
// Re-read the String if we're a 7 bit one
|
||||||
if(type == Types.ASCII_STRING) {
|
if(type == Types.ASCII_STRING) {
|
||||||
parseString(rawValue);
|
parseString();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void readValue(InputStream value) throws IOException {
|
public void readValue(InputStream value) throws IOException {
|
||||||
byte[] data = IOUtils.toByteArray(value);
|
rawValue = IOUtils.toByteArray(value);
|
||||||
parseString(data);
|
parseString();
|
||||||
}
|
}
|
||||||
private void parseString(byte[] data) {
|
private void parseString() {
|
||||||
String tmpValue;
|
String tmpValue;
|
||||||
switch(type) {
|
switch(type) {
|
||||||
case Types.ASCII_STRING:
|
case Types.ASCII_STRING:
|
||||||
tmpValue = parseAs7BitData(data, encoding7Bit);
|
tmpValue = parseAs7BitData(rawValue, encoding7Bit);
|
||||||
this.rawValue = data;
|
|
||||||
break;
|
break;
|
||||||
case Types.UNICODE_STRING:
|
case Types.UNICODE_STRING:
|
||||||
tmpValue = StringUtil.getFromUnicodeLE(data);
|
tmpValue = StringUtil.getFromUnicodeLE(rawValue);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
|
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
|
||||||
|
@ -99,34 +97,46 @@ public class StringChunk extends Chunk {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void writeValue(OutputStream out) throws IOException {
|
public void writeValue(OutputStream out) throws IOException {
|
||||||
byte[] data;
|
out.write(rawValue);
|
||||||
|
}
|
||||||
|
private void storeString() {
|
||||||
switch(type) {
|
switch(type) {
|
||||||
case Types.ASCII_STRING:
|
case Types.ASCII_STRING:
|
||||||
try {
|
try {
|
||||||
data = value.getBytes(encoding7Bit);
|
rawValue = value.getBytes(encoding7Bit);
|
||||||
} catch (UnsupportedEncodingException e) {
|
} catch (UnsupportedEncodingException e) {
|
||||||
throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
|
throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case Types.UNICODE_STRING:
|
case Types.UNICODE_STRING:
|
||||||
data = new byte[value.length()*2];
|
rawValue = new byte[value.length()*2];
|
||||||
StringUtil.putUnicodeLE(value, data, 0);
|
StringUtil.putUnicodeLE(value, rawValue, 0);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
|
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
|
||||||
}
|
}
|
||||||
|
|
||||||
out.write(data);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the Text value of the chunk
|
||||||
|
*/
|
||||||
public String getValue() {
|
public String getValue() {
|
||||||
return this.value;
|
return this.value;
|
||||||
}
|
}
|
||||||
public String toString() {
|
|
||||||
return this.value;
|
public byte[] getRawValue() {
|
||||||
}
|
return this.rawValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setValue(String str) {
|
||||||
|
this.value = str;
|
||||||
|
storeString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return this.value;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parses as non-unicode, supposedly 7 bit CP1252 data
|
* Parses as non-unicode, supposedly 7 bit CP1252 data
|
||||||
* and returns the resulting string.
|
* and returns the resulting string.
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.hsmf.extractor;
|
package org.apache.poi.hsmf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
|
@ -56,6 +57,15 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
|
||||||
public OutlookTextExtactor(InputStream inp) throws IOException {
|
public OutlookTextExtactor(InputStream inp) throws IOException {
|
||||||
this(new MAPIMessage(inp));
|
this(new MAPIMessage(inp));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
for(String filename : args) {
|
||||||
|
OutlookTextExtactor extractor = new OutlookTextExtactor(
|
||||||
|
new NPOIFSFileSystem(new File(filename))
|
||||||
|
);
|
||||||
|
System.out.println( extractor.getText() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the underlying MAPI message
|
* Returns the underlying MAPI message
|
||||||
|
@ -71,6 +81,11 @@ public class OutlookTextExtactor extends POIOLE2TextExtractor {
|
||||||
MAPIMessage msg = (MAPIMessage)document;
|
MAPIMessage msg = (MAPIMessage)document;
|
||||||
StringBuffer s = new StringBuffer();
|
StringBuffer s = new StringBuffer();
|
||||||
|
|
||||||
|
// See if we can get a suitable encoding for any
|
||||||
|
// non unicode text in the file
|
||||||
|
msg.guess7BitEncoding();
|
||||||
|
|
||||||
|
// Off we go
|
||||||
StringsIterator emails;
|
StringsIterator emails;
|
||||||
try {
|
try {
|
||||||
emails = new StringsIterator(
|
emails = new StringsIterator(
|
||||||
|
|
|
@ -199,4 +199,21 @@ public final class TestOutlookTextExtractor extends TestCase {
|
||||||
// Embedded bits are checked in
|
// Embedded bits are checked in
|
||||||
// TestExtractorFactory
|
// TestExtractorFactory
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testEncodings() throws Exception {
|
||||||
|
POIFSFileSystem simple = new POIFSFileSystem(
|
||||||
|
new FileInputStream(samples.getFile("chinese-traditional.msg"))
|
||||||
|
);
|
||||||
|
MAPIMessage msg = new MAPIMessage(simple);
|
||||||
|
OutlookTextExtactor ext = new OutlookTextExtactor(msg);
|
||||||
|
String text = ext.getText();
|
||||||
|
|
||||||
|
// Check the English bits
|
||||||
|
assertContains(text, "From: Tests Chang@FT");
|
||||||
|
assertContains(text, "tests.chang@fengttt.com");
|
||||||
|
|
||||||
|
// And check some Chinese bits
|
||||||
|
assertContains(text, "(\u5f35\u6bd3\u502b)");
|
||||||
|
assertContains(text, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue