mirror of https://github.com/apache/poi.git
Fix bug #49441 - Allow overriding and guessing of HSMF non-unicode string encodings
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@981947 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
452fa02182
commit
62499bc465
|
@ -34,6 +34,7 @@
|
||||||
|
|
||||||
<changes>
|
<changes>
|
||||||
<release version="3.7-beta2" date="2010-??-??">
|
<release version="3.7-beta2" date="2010-??-??">
|
||||||
|
<action dev="POI-DEVELOPERS" type="add">49441 - Allow overriding and guessing of HSMF non-unicode string encodings</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">49689 - Allow the setting of user style names on newly created HSSF cell styles</action>
|
<action dev="POI-DEVELOPERS" type="fix">49689 - Allow the setting of user style names on newly created HSSF cell styles</action>
|
||||||
<action dev="POI-DEVELOPERS" type="add">Make it easier to tell which content types each POIXMLTextExtractor handles</action>
|
<action dev="POI-DEVELOPERS" type="add">Make it easier to tell which content types each POIXMLTextExtractor handles</action>
|
||||||
<action dev="POI-DEVELOPERS" type="fix">49649 - Added clone support for UserSView* and Feat* families of records</action>
|
<action dev="POI-DEVELOPERS" type="fix">49649 - Added clone support for UserSView* and Feat* families of records</action>
|
||||||
|
|
|
@ -25,10 +25,13 @@ import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Calendar;
|
import java.util.Calendar;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.apache.poi.POIDocument;
|
import org.apache.poi.POIDocument;
|
||||||
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
|
||||||
import org.apache.poi.hsmf.datatypes.AttachmentChunks.AttachmentChunksSorter;
|
import org.apache.poi.hsmf.datatypes.AttachmentChunks.AttachmentChunksSorter;
|
||||||
|
import org.apache.poi.hsmf.datatypes.Chunk;
|
||||||
import org.apache.poi.hsmf.datatypes.ChunkGroup;
|
import org.apache.poi.hsmf.datatypes.ChunkGroup;
|
||||||
import org.apache.poi.hsmf.datatypes.Chunks;
|
import org.apache.poi.hsmf.datatypes.Chunks;
|
||||||
import org.apache.poi.hsmf.datatypes.NameIdChunks;
|
import org.apache.poi.hsmf.datatypes.NameIdChunks;
|
||||||
|
@ -286,10 +289,58 @@ public class MAPIMessage extends POIDocument {
|
||||||
|
|
||||||
return names;
|
return names;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* Many messages store their strings as unicode, which is
|
||||||
|
* nice and easy. Some use one-byte encodings for their
|
||||||
|
* strings, but don't easily store the encoding anywhere
|
||||||
|
* in the file!
|
||||||
|
* This method looks at the headers for the message, and
|
||||||
|
* tries to use these to guess the correct encoding for
|
||||||
|
* your file.
|
||||||
|
* Bug #49441 has more on why this is needed
|
||||||
|
*/
|
||||||
|
public void guess7BitEncoding() {
|
||||||
|
try {
|
||||||
|
String[] headers = getHeaders();
|
||||||
|
if(headers == null || headers.length == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for a content type with a charset
|
||||||
|
Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?(.*?)[\"']?");
|
||||||
|
for(String header : headers) {
|
||||||
|
if(header.startsWith("Content-Type")) {
|
||||||
|
Matcher m = p.matcher(header);
|
||||||
|
if(m.matches()) {
|
||||||
|
// Found it! Tell all the string chunks
|
||||||
|
String charset = m.group(1);
|
||||||
|
|
||||||
|
for(Chunk c : mainChunks.getAll()) {
|
||||||
|
if(c instanceof StringChunk) {
|
||||||
|
((StringChunk)c).set7BitEncoding(charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(Chunk c : nameIdChunks.getAll()) {
|
||||||
|
if(c instanceof StringChunk) {
|
||||||
|
((StringChunk)c).set7BitEncoding(charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(RecipientChunks rc : recipientChunks) {
|
||||||
|
for(Chunk c : rc.getAll()) {
|
||||||
|
if(c instanceof StringChunk) {
|
||||||
|
((StringChunk)c).set7BitEncoding(charset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch(ChunkNotFoundException e) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns all the headers, one entry per line
|
||||||
*/
|
*/
|
||||||
public String[] getHeaders() throws ChunkNotFoundException {
|
public String[] getHeaders() throws ChunkNotFoundException {
|
||||||
String headers = getStringFromChunk(mainChunks.messageHeaders);
|
String headers = getStringFromChunk(mainChunks.messageHeaders);
|
||||||
|
|
|
@ -30,8 +30,8 @@ import org.apache.poi.util.StringUtil;
|
||||||
* A Chunk made up of a single string.
|
* A Chunk made up of a single string.
|
||||||
*/
|
*/
|
||||||
public class StringChunk extends Chunk {
|
public class StringChunk extends Chunk {
|
||||||
|
|
||||||
private String value;
|
private String value;
|
||||||
|
private String encoding7Bit = "CP1252";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a String Chunk.
|
* Creates a String Chunk.
|
||||||
|
@ -48,13 +48,33 @@ public class StringChunk extends Chunk {
|
||||||
super(chunkId, type);
|
super(chunkId, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the Encoding that will be used to
|
||||||
|
* decode any "7 bit" (non unicode) data.
|
||||||
|
* Most files default to CP1252
|
||||||
|
*/
|
||||||
|
public String get7BitEncoding() {
|
||||||
|
return encoding7Bit;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the Encoding that will be used to
|
||||||
|
* decode any "7 bit" (non unicode) data.
|
||||||
|
* This doesn't appear to be stored anywhere
|
||||||
|
* specific in the file, so you may need
|
||||||
|
* to guess by looking at headers etc
|
||||||
|
*/
|
||||||
|
public void set7BitEncoding(String encoding) {
|
||||||
|
this.encoding7Bit = encoding;
|
||||||
|
}
|
||||||
|
|
||||||
public void readValue(InputStream value) throws IOException {
|
public void readValue(InputStream value) throws IOException {
|
||||||
String tmpValue;
|
String tmpValue;
|
||||||
byte[] data = IOUtils.toByteArray(value);
|
byte[] data = IOUtils.toByteArray(value);
|
||||||
|
|
||||||
switch(type) {
|
switch(type) {
|
||||||
case Types.ASCII_STRING:
|
case Types.ASCII_STRING:
|
||||||
tmpValue = parseAs7BitData(data);
|
tmpValue = parseAs7BitData(data, encoding7Bit);
|
||||||
break;
|
break;
|
||||||
case Types.UNICODE_STRING:
|
case Types.UNICODE_STRING:
|
||||||
tmpValue = StringUtil.getFromUnicodeLE(data);
|
tmpValue = StringUtil.getFromUnicodeLE(data);
|
||||||
|
@ -73,9 +93,9 @@ public class StringChunk extends Chunk {
|
||||||
switch(type) {
|
switch(type) {
|
||||||
case Types.ASCII_STRING:
|
case Types.ASCII_STRING:
|
||||||
try {
|
try {
|
||||||
data = value.getBytes("CP1252");
|
data = value.getBytes(encoding7Bit);
|
||||||
} catch (UnsupportedEncodingException e) {
|
} catch (UnsupportedEncodingException e) {
|
||||||
throw new RuntimeException("Core encoding not found, JVM broken?", e);
|
throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case Types.UNICODE_STRING:
|
case Types.UNICODE_STRING:
|
||||||
|
@ -101,10 +121,17 @@ public class StringChunk extends Chunk {
|
||||||
* and returns the string that that yields.
|
* and returns the string that that yields.
|
||||||
*/
|
*/
|
||||||
protected static String parseAs7BitData(byte[] data) {
|
protected static String parseAs7BitData(byte[] data) {
|
||||||
|
return parseAs7BitData(data, "CP1252");
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Parses as non-unicode, supposedly 7 bit data
|
||||||
|
* and returns the string that that yields.
|
||||||
|
*/
|
||||||
|
protected static String parseAs7BitData(byte[] data, String encoding) {
|
||||||
try {
|
try {
|
||||||
return new String(data, "CP1252");
|
return new String(data, encoding);
|
||||||
} catch (UnsupportedEncodingException e) {
|
} catch (UnsupportedEncodingException e) {
|
||||||
throw new RuntimeException("Core encoding not found, JVM broken?", e);
|
throw new RuntimeException("Encoding not found - " + encoding, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,6 +34,7 @@ public final class TestBasics extends TestCase {
|
||||||
private MAPIMessage outlook30;
|
private MAPIMessage outlook30;
|
||||||
private MAPIMessage attachments;
|
private MAPIMessage attachments;
|
||||||
private MAPIMessage noRecipientAddress;
|
private MAPIMessage noRecipientAddress;
|
||||||
|
private MAPIMessage cyrillic;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize this test, load up the blank.msg mapi message.
|
* Initialize this test, load up the blank.msg mapi message.
|
||||||
|
@ -46,6 +47,7 @@ public final class TestBasics extends TestCase {
|
||||||
outlook30 = new MAPIMessage(samples.openResourceAsStream("outlook_30_msg.msg"));
|
outlook30 = new MAPIMessage(samples.openResourceAsStream("outlook_30_msg.msg"));
|
||||||
attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
|
attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
|
||||||
noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
|
noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
|
||||||
|
cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -177,4 +179,21 @@ public final class TestBasics extends TestCase {
|
||||||
|
|
||||||
noRecipientAddress.setReturnNullOnMissingChunk(false);
|
noRecipientAddress.setReturnNullOnMissingChunk(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We default to CP1252, but can sometimes do better
|
||||||
|
* if needed.
|
||||||
|
* This file is really CP1251, according to the person
|
||||||
|
* who submitted it in bug #49441
|
||||||
|
*/
|
||||||
|
public void testEncoding() throws Exception {
|
||||||
|
assertEquals(2, cyrillic.getRecipientDetailsChunks().length);
|
||||||
|
assertEquals("CP1252", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
|
assertEquals("CP1252", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
|
|
||||||
|
cyrillic.guess7BitEncoding();
|
||||||
|
|
||||||
|
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
|
assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue