mirror of https://github.com/apache/poi.git
Track the codepage in old Excel files, to be able to correctly decode the 8-bit strings in them
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642561 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1c7a6d9254
commit
0d21e6e1da
|
@ -25,6 +25,7 @@ import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
|
||||||
import org.apache.poi.hssf.record.BOFRecord;
|
import org.apache.poi.hssf.record.BOFRecord;
|
||||||
|
import org.apache.poi.hssf.record.CodepageRecord;
|
||||||
import org.apache.poi.hssf.record.FormulaRecord;
|
import org.apache.poi.hssf.record.FormulaRecord;
|
||||||
import org.apache.poi.hssf.record.NumberRecord;
|
import org.apache.poi.hssf.record.NumberRecord;
|
||||||
import org.apache.poi.hssf.record.OldFormulaRecord;
|
import org.apache.poi.hssf.record.OldFormulaRecord;
|
||||||
|
@ -110,6 +111,8 @@ public class OldExcelExtractor {
|
||||||
* for these old file formats
|
* for these old file formats
|
||||||
*/
|
*/
|
||||||
public String getText() {
|
public String getText() {
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
// Work out what version we're dealing with
|
// Work out what version we're dealing with
|
||||||
int bofSid = ris.getNextSid();
|
int bofSid = ris.getNextSid();
|
||||||
switch (bofSid) {
|
switch (bofSid) {
|
||||||
|
@ -129,7 +132,9 @@ public class OldExcelExtractor {
|
||||||
throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid);
|
throw new IllegalArgumentException("File does not begin with a BOF, found sid of " + bofSid);
|
||||||
}
|
}
|
||||||
|
|
||||||
StringBuffer text = new StringBuffer();
|
// To track formats and encodings
|
||||||
|
CodepageRecord codepage = null;
|
||||||
|
|
||||||
while (ris.hasNextRecord()) {
|
while (ris.hasNextRecord()) {
|
||||||
int sid = ris.getNextSid();
|
int sid = ris.getNextSid();
|
||||||
ris.nextRecord();
|
ris.nextRecord();
|
||||||
|
@ -139,6 +144,7 @@ public class OldExcelExtractor {
|
||||||
case OldLabelRecord.biff2_sid:
|
case OldLabelRecord.biff2_sid:
|
||||||
case OldLabelRecord.biff345_sid:
|
case OldLabelRecord.biff345_sid:
|
||||||
OldLabelRecord lr = new OldLabelRecord(ris);
|
OldLabelRecord lr = new OldLabelRecord(ris);
|
||||||
|
lr.setCodePage(codepage);
|
||||||
text.append(lr.getValue());
|
text.append(lr.getValue());
|
||||||
text.append('\n');
|
text.append('\n');
|
||||||
break;
|
break;
|
||||||
|
@ -146,6 +152,7 @@ public class OldExcelExtractor {
|
||||||
case OldStringRecord.biff2_sid:
|
case OldStringRecord.biff2_sid:
|
||||||
case OldStringRecord.biff345_sid:
|
case OldStringRecord.biff345_sid:
|
||||||
OldStringRecord sr = new OldStringRecord(ris);
|
OldStringRecord sr = new OldStringRecord(ris);
|
||||||
|
sr.setCodePage(codepage);
|
||||||
text.append(sr.getString());
|
text.append(sr.getString());
|
||||||
text.append('\n');
|
text.append('\n');
|
||||||
break;
|
break;
|
||||||
|
@ -175,6 +182,10 @@ public class OldExcelExtractor {
|
||||||
handleNumericCell(text, rr.getRKNumber());
|
handleNumericCell(text, rr.getRKNumber());
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case CodepageRecord.sid:
|
||||||
|
codepage = new CodepageRecord(ris);
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
ris.readFully(new byte[ris.remaining()]);
|
ris.readFully(new byte[ris.remaining()]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,13 +19,15 @@
|
||||||
|
|
||||||
package org.apache.poi.hssf.record;
|
package org.apache.poi.hssf.record;
|
||||||
|
|
||||||
|
import org.apache.poi.util.CodePageUtil;
|
||||||
import org.apache.poi.util.LittleEndianOutput;
|
import org.apache.poi.util.LittleEndianOutput;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Title: Codepage Record<P>
|
* Title: Codepage Record
|
||||||
* Description: the default characterset. for the workbook<P>
|
* <p>Description: the default characterset. for the workbook</p>
|
||||||
* REFERENCE: PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
|
* <p>REFERENCE: PG 293 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)</p>
|
||||||
* @author Andrew C. Oliver (acoliver at apache dot org)
|
* <p>Use {@link CodePageUtil} to turn these values into Java code pages
|
||||||
|
* to encode/decode strings.</p>
|
||||||
* @version 2.0-pre
|
* @version 2.0-pre
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -36,11 +38,10 @@ public final class CodepageRecord
|
||||||
private short field_1_codepage; // = 0;
|
private short field_1_codepage; // = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the likely correct value for CODEPAGE (at least for US versions). We could use
|
* Excel 97+ (Biff 8) should always store strings as UTF-16LE or
|
||||||
* some help with international versions (which we do not have access to documentation
|
* compressed versions of that. As such, this should always be
|
||||||
* for)
|
* 0x4b0 = UTF_16, except for files coming from older versions.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public final static short CODEPAGE = ( short ) 0x4b0;
|
public final static short CODEPAGE = ( short ) 0x4b0;
|
||||||
|
|
||||||
public CodepageRecord()
|
public CodepageRecord()
|
||||||
|
|
|
@ -32,9 +32,9 @@ public final class OldLabelRecord extends OldCellRecord {
|
||||||
public final static short biff2_sid = 0x0004;
|
public final static short biff2_sid = 0x0004;
|
||||||
public final static short biff345_sid = 0x0204;
|
public final static short biff345_sid = 0x0204;
|
||||||
|
|
||||||
private short field_4_string_len;
|
private short field_4_string_len;
|
||||||
private byte[] field_5_bytes;
|
private byte[] field_5_bytes;
|
||||||
//private XXXXX codepage; // TODO Implement for this and OldStringRecord
|
private CodepageRecord codepage;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param in the RecordInputstream to read the record from
|
* @param in the RecordInputstream to read the record from
|
||||||
|
@ -61,6 +61,10 @@ public final class OldLabelRecord extends OldCellRecord {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setCodePage(CodepageRecord codepage) {
|
||||||
|
this.codepage = codepage;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get the number of characters this string contains
|
* get the number of characters this string contains
|
||||||
* @return number of characters
|
* @return number of characters
|
||||||
|
@ -75,8 +79,7 @@ public final class OldLabelRecord extends OldCellRecord {
|
||||||
*/
|
*/
|
||||||
public String getValue()
|
public String getValue()
|
||||||
{
|
{
|
||||||
// We really need the codepage here to do this right...
|
return OldStringRecord.getString(field_5_bytes, codepage);
|
||||||
return new String(field_5_bytes);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -17,6 +17,10 @@
|
||||||
|
|
||||||
package org.apache.poi.hssf.record;
|
package org.apache.poi.hssf.record;
|
||||||
|
|
||||||
|
import java.io.UnsupportedEncodingException;
|
||||||
|
|
||||||
|
import org.apache.poi.util.CodePageUtil;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for
|
* Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for
|
||||||
|
@ -29,7 +33,7 @@ public final class OldStringRecord {
|
||||||
private short sid;
|
private short sid;
|
||||||
private short field_1_string_len;
|
private short field_1_string_len;
|
||||||
private byte[] field_2_bytes;
|
private byte[] field_2_bytes;
|
||||||
//private XXXXX codepage; // TODO Implement for this and OldLabelRecord
|
private CodepageRecord codepage;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param in the RecordInputstream to read the record from
|
* @param in the RecordInputstream to read the record from
|
||||||
|
@ -56,13 +60,29 @@ public final class OldStringRecord {
|
||||||
return sid;
|
return sid;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setCodePage(CodepageRecord codepage) {
|
||||||
|
this.codepage = codepage;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return The string represented by this record.
|
* @return The string represented by this record.
|
||||||
*/
|
*/
|
||||||
public String getString()
|
public String getString()
|
||||||
{
|
{
|
||||||
// We really need the codepage here to do this right...
|
return getString(field_2_bytes, codepage);
|
||||||
return new String(field_2_bytes);
|
}
|
||||||
|
|
||||||
|
protected static String getString(byte[] data, CodepageRecord codepage) {
|
||||||
|
int cp = CodePageUtil.CP_ISO_8859_1;
|
||||||
|
if (codepage != null) {
|
||||||
|
cp = codepage.getCodepage();
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
return CodePageUtil.getStringFromCodePage(data, cp);
|
||||||
|
} catch (UnsupportedEncodingException uee) {
|
||||||
|
// Hope the system default is ok...
|
||||||
|
return new String(data);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString()
|
public String toString()
|
||||||
|
|
|
@ -81,8 +81,7 @@ public final class TestOldExcelExtractor extends POITestCase {
|
||||||
// More complicated strings
|
// More complicated strings
|
||||||
assertContains(text, "$100,000 or more");
|
assertContains(text, "$100,000 or more");
|
||||||
assertContains(text, "S corporation returns, Form 1120S [10,15]");
|
assertContains(text, "S corporation returns, Form 1120S [10,15]");
|
||||||
// TODO Get these quotes working correctly
|
assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
|
||||||
// assertContains(text, "individual income tax return \u201Cshort forms.\u201D");
|
|
||||||
|
|
||||||
// Formula based strings
|
// Formula based strings
|
||||||
// TODO Find some then test
|
// TODO Find some then test
|
||||||
|
|
Loading…
Reference in New Issue