Track the codepage in old Excel files, so that the 8-bit strings in them can be decoded correctly

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642561 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2014-11-30 16:21:39 +00:00
parent 1c7a6d9254
commit 0d21e6e1da
5 changed files with 53 additions and 19 deletions

View File

@ -25,6 +25,7 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import org.apache.poi.hssf.record.BOFRecord; import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.CodepageRecord;
import org.apache.poi.hssf.record.FormulaRecord; import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.OldFormulaRecord; import org.apache.poi.hssf.record.OldFormulaRecord;
@ -110,6 +111,8 @@ public class OldExcelExtractor {
* for these old file formats * for these old file formats
*/ */
public String getText() { public String getText() {
StringBuffer text = new StringBuffer();
// Work out what version we're dealing with // Work out what version we're dealing with
int bofSid = ris.getNextSid(); int bofSid = ris.getNextSid();
switch (bofSid) { switch (bofSid) {
@ -128,8 +131,10 @@ public class OldExcelExtractor {
default: default:
throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid); throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid);
} }
// To track formats and encodings
CodepageRecord codepage = null;
StringBuffer text = new StringBuffer();
while (ris.hasNextRecord()) { while (ris.hasNextRecord()) {
int sid = ris.getNextSid(); int sid = ris.getNextSid();
ris.nextRecord(); ris.nextRecord();
@ -139,6 +144,7 @@ public class OldExcelExtractor {
case OldLabelRecord.biff2_sid: case OldLabelRecord.biff2_sid:
case OldLabelRecord.biff345_sid: case OldLabelRecord.biff345_sid:
OldLabelRecord lr = new OldLabelRecord(ris); OldLabelRecord lr = new OldLabelRecord(ris);
lr.setCodePage(codepage);
text.append(lr.getValue()); text.append(lr.getValue());
text.append('\n'); text.append('\n');
break; break;
@ -146,6 +152,7 @@ public class OldExcelExtractor {
case OldStringRecord.biff2_sid: case OldStringRecord.biff2_sid:
case OldStringRecord.biff345_sid: case OldStringRecord.biff345_sid:
OldStringRecord sr = new OldStringRecord(ris); OldStringRecord sr = new OldStringRecord(ris);
sr.setCodePage(codepage);
text.append(sr.getString()); text.append(sr.getString());
text.append('\n'); text.append('\n');
break; break;
@ -175,6 +182,10 @@ public class OldExcelExtractor {
handleNumericCell(text, rr.getRKNumber()); handleNumericCell(text, rr.getRKNumber());
break; break;
case CodepageRecord.sid:
codepage = new CodepageRecord(ris);
break;
default: default:
ris.readFully(new byte[ris.remaining()]); ris.readFully(new byte[ris.remaining()]);
} }

View File

@ -19,13 +19,15 @@
package org.apache.poi.hssf.record; package org.apache.poi.hssf.record;
import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.LittleEndianOutput; import org.apache.poi.util.LittleEndianOutput;
/** /**
* Title: Codepage Record<P> * Title: Codepage Record
* Description: the default characterset. for the workbook<P> * <p>Description: the default characterset. for the workbook</p>
* REFERENCE: PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P> * <p>REFERENCE: PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)</p>
* @author Andrew C. Oliver (acoliver at apache dot org) * <p>Use {@link CodePageUtil} to turn these values into Java code pages
* to encode/decode strings.</p>
* @version 2.0-pre * @version 2.0-pre
*/ */
@ -36,11 +38,10 @@ public final class CodepageRecord
private short field_1_codepage; // = 0; private short field_1_codepage; // = 0;
/** /**
* the likely correct value for CODEPAGE (at least for US versions). We could use * Excel 97+ (Biff 8) should always store strings as UTF-16LE or
* some help with international versions (which we do not have access to documentation * compressed versions of that. As such, this should always be
* for) * 0x4b0 = UTF_16, except for files coming from older versions.
*/ */
public final static short CODEPAGE = ( short ) 0x4b0; public final static short CODEPAGE = ( short ) 0x4b0;
public CodepageRecord() public CodepageRecord()

View File

@ -32,9 +32,9 @@ public final class OldLabelRecord extends OldCellRecord {
public final static short biff2_sid = 0x0004; public final static short biff2_sid = 0x0004;
public final static short biff345_sid = 0x0204; public final static short biff345_sid = 0x0204;
private short field_4_string_len; private short field_4_string_len;
private byte[] field_5_bytes; private byte[] field_5_bytes;
//private XXXXX codepage; // TODO Implement for this and OldStringRecord private CodepageRecord codepage;
/** /**
* @param in the RecordInputstream to read the record from * @param in the RecordInputstream to read the record from
@ -61,6 +61,10 @@ public final class OldLabelRecord extends OldCellRecord {
} }
} }
/**
 * Stores the codepage record that {@link #getValue()} hands to the
 * string decoder when turning this record's bytes into text.
 *
 * @param codepage the codepage record to decode with; may be null
 *  (the extractor passes null until a codepage record has been read)
 */
public void setCodePage(CodepageRecord codepage) {
this.codepage = codepage;
}
/** /**
* get the number of characters this string contains * get the number of characters this string contains
* @return number of characters * @return number of characters
@ -75,8 +79,7 @@ public final class OldLabelRecord extends OldCellRecord {
*/ */
public String getValue() public String getValue()
{ {
// We really need the codepage here to do this right... return OldStringRecord.getString(field_5_bytes, codepage);
return new String(field_5_bytes);
} }
/** /**

View File

@ -17,6 +17,10 @@
package org.apache.poi.hssf.record; package org.apache.poi.hssf.record;
import java.io.UnsupportedEncodingException;
import org.apache.poi.util.CodePageUtil;
/** /**
* Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for * Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for
@ -29,7 +33,7 @@ public final class OldStringRecord {
private short sid; private short sid;
private short field_1_string_len; private short field_1_string_len;
private byte[] field_2_bytes; private byte[] field_2_bytes;
//private XXXXX codepage; // TODO Implement for this and OldLabelRecord private CodepageRecord codepage;
/** /**
* @param in the RecordInputstream to read the record from * @param in the RecordInputstream to read the record from
@ -55,14 +59,30 @@ public final class OldStringRecord {
public short getSid() { public short getSid() {
return sid; return sid;
} }
/**
 * Stores the codepage record that {@link #getString()} uses when
 * decoding this record's bytes into text.
 *
 * @param codepage the codepage record to decode with; may be null, in
 *  which case decoding falls back to the default chosen by
 *  {@link #getString(byte[], CodepageRecord)}
 */
public void setCodePage(CodepageRecord codepage) {
this.codepage = codepage;
}
/** /**
* @return The string represented by this record. * @return The string represented by this record.
*/ */
public String getString() public String getString()
{ {
// We really need the codepage here to do this right... return getString(field_2_bytes, codepage);
return new String(field_2_bytes); }
/**
 * Decodes the given bytes into a String using the codepage named by the
 * supplied codepage record. Shared by this class and OldLabelRecord.
 *
 * @param data the encoded string bytes
 * @param codepage the record supplying the codepage number; may be null,
 *  in which case {@link CodePageUtil#CP_ISO_8859_1} is used
 * @return the decoded string; if the codepage is not supported, the
 *  bytes are decoded with the platform default charset instead
 */
protected static String getString(byte[] data, CodepageRecord codepage) {
// Default used when no codepage record has been supplied
int cp = CodePageUtil.CP_ISO_8859_1;
if (codepage != null) {
cp = codepage.getCodepage();
}
try {
return CodePageUtil.getStringFromCodePage(data, cp);
} catch (UnsupportedEncodingException uee) {
// Hope the system default is ok...
// Best-effort fallback: new String(byte[]) decodes with the platform
// default charset rather than failing the whole extraction.
return new String(data);
}
} }
public String toString() public String toString()

View File

@ -81,8 +81,7 @@ public final class TestOldExcelExtractor extends POITestCase {
// More complicated strings // More complicated strings
assertContains(text, "$100,000 or more"); assertContains(text, "$100,000 or more");
assertContains(text, "S corporation returns, Form 1120S [10,15]"); assertContains(text, "S corporation returns, Form 1120S [10,15]");
// TODO Get these quotes working correctly assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
// assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
// Formula based strings // Formula based strings
// TODO Find some then test // TODO Find some then test