mirror of https://github.com/apache/poi.git
Further Excel 4 text extractor support, for TIKA-1490
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642491 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
73bd034c79
commit
ff4b0376c8
|
@ -22,9 +22,13 @@ import java.io.FileInputStream;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.hssf.record.LabelRecord;
|
||||
import org.apache.poi.hssf.record.FormulaRecord;
|
||||
import org.apache.poi.hssf.record.NumberRecord;
|
||||
import org.apache.poi.hssf.record.OldLabelRecord;
|
||||
import org.apache.poi.hssf.record.OldStringRecord;
|
||||
import org.apache.poi.hssf.record.RKRecord;
|
||||
import org.apache.poi.hssf.record.RecordInputStream;
|
||||
import org.apache.poi.ss.usermodel.Cell;
|
||||
|
||||
/**
|
||||
* A text extractor for very old (pre-OLE2) Excel files,
|
||||
|
@ -76,20 +80,44 @@ public class OldExcelExtractor {
|
|||
ris.nextRecord();
|
||||
|
||||
switch (sid) {
|
||||
case LabelRecord.sid:
|
||||
// label - 5.63 - TODO Needs codepages
|
||||
case OldLabelRecord.biff2_sid:
|
||||
case OldLabelRecord.biff345_sid:
|
||||
OldLabelRecord lr = new OldLabelRecord(ris);
|
||||
text.append(lr.getValue());
|
||||
text.append('\n');
|
||||
break;
|
||||
// string - 5.102 - TODO Needs codepages
|
||||
case OldStringRecord.biff2_sid:
|
||||
case OldStringRecord.biff345_sid:
|
||||
OldStringRecord sr = new OldStringRecord(ris);
|
||||
text.append(sr.getString());
|
||||
text.append('\n');
|
||||
break;
|
||||
// number - 5.71 - TODO Needs format strings
|
||||
case NumberRecord.sid:
|
||||
NumberRecord nr = new NumberRecord(ris);
|
||||
text.append(nr.getValue());
|
||||
text.append('\n');
|
||||
break;
|
||||
/*
|
||||
case OldFormulaRecord.sid:
|
||||
FormulaRecord fr = new FormulaRecord(ris);
|
||||
System.out.println(fr.getCachedResultType());
|
||||
if (fr.getCachedResultType() == Cell.CELL_TYPE_NUMERIC) {
|
||||
text.append(fr.getValue());
|
||||
text.append('\n');
|
||||
}
|
||||
*/
|
||||
case RKRecord.sid:
|
||||
RKRecord rr = new RKRecord(ris);
|
||||
text.append(rr.getRKNumber());
|
||||
text.append('\n');
|
||||
break;
|
||||
default:
|
||||
ris.readFully(new byte[ris.remaining()]);
|
||||
// text.append(" = " + ris.getSid() + " = \n");
|
||||
}
|
||||
|
||||
// label - 5.63 - TODO Needs codepages
|
||||
// number - 5.71
|
||||
// rk - 5.87
|
||||
// string - 5.102
|
||||
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.poi.util.LittleEndianOutput;
|
|||
public final class FormulaRecord extends CellRecord {
|
||||
|
||||
public static final short sid = 0x0006; // docs say 406...because of a bug Microsoft support site article #Q184647)
|
||||
public static final short olderSid = 0x0406; // older biff versions do manage 406!
|
||||
private static int FIXED_SIZE = 14; // double + short + int
|
||||
|
||||
private static final BitField alwaysCalc = BitFieldFactory.getInstance(0x0001);
|
||||
|
|
|
@ -39,7 +39,7 @@ public final class OldLabelRecord extends Record implements CellValueRecordInter
|
|||
private short field_3_xf_index; // Biff 3+
|
||||
private short field_4_string_len;
|
||||
private byte[] field_5_bytes;
|
||||
//private XXXXX codepage; // TODO
|
||||
//private XXXXX codepage; // TODO Implement for this and OldStringRecord
|
||||
|
||||
/**
|
||||
* @param in the RecordInputstream to read the record from
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hssf.record;
|
||||
|
||||
|
||||
/**
|
||||
* Biff2 - Biff 4 Label Record (0x0007 / 0x0207) - read only support for
|
||||
* formula string results.
|
||||
*/
|
||||
public final class OldStringRecord {
|
||||
public final static short biff2_sid = 0x0007;
|
||||
public final static short biff345_sid = 0x0207;
|
||||
|
||||
private short sid;
|
||||
private short field_1_string_len;
|
||||
private byte[] field_2_bytes;
|
||||
//private XXXXX codepage; // TODO Implement for this and OldLabelRecord
|
||||
|
||||
/**
|
||||
* @param in the RecordInputstream to read the record from
|
||||
*/
|
||||
public OldStringRecord(RecordInputStream in) {
|
||||
sid = in.getSid();
|
||||
|
||||
if (in.getSid() == biff2_sid) {
|
||||
field_1_string_len = (short)in.readUByte();
|
||||
} else {
|
||||
field_1_string_len = in.readShort();
|
||||
}
|
||||
|
||||
// Can only decode properly later when you know the codepage
|
||||
field_2_bytes = new byte[field_1_string_len];
|
||||
in.read(field_2_bytes, 0, field_1_string_len);
|
||||
}
|
||||
|
||||
public boolean isBiff2() {
|
||||
return sid == biff2_sid;
|
||||
}
|
||||
|
||||
public short getSid() {
|
||||
return sid;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The string represented by this record.
|
||||
*/
|
||||
public String getString()
|
||||
{
|
||||
// We really need the codepage here to do this right...
|
||||
return new String(field_2_bytes);
|
||||
}
|
||||
|
||||
public String toString()
|
||||
{
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
|
||||
buffer.append("[OLD STRING]\n");
|
||||
buffer.append(" .string = ")
|
||||
.append(getString()).append("\n");
|
||||
buffer.append("[/OLD STRING]\n");
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
|
@ -46,7 +46,45 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||
// Check we find a few words we expect in there
|
||||
assertTrue(text, text.contains("Size"));
|
||||
assertTrue(text, text.contains("Returns"));
|
||||
|
||||
// Check we find a few numbers we expect in there
|
||||
assertTrue(text, text.contains("11"));
|
||||
assertTrue(text, text.contains("784"));
|
||||
}
|
||||
|
||||
// TODO Rest of the tests
|
||||
public void testStrings() {
|
||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||
String text = extractor.getText();
|
||||
|
||||
// Simple strings
|
||||
assertTrue(text, text.contains("Table 10 -- Examination Coverage:"));
|
||||
assertTrue(text, text.contains("Recommended and Average Recommended Additional Tax After"));
|
||||
assertTrue(text, text.contains("Individual income tax returns, total"));
|
||||
|
||||
// More complicated strings
|
||||
assertTrue(text, text.contains("$100,000 or more"));
|
||||
assertTrue(text, text.contains("S corporation returns, Form 1120S [10,15]"));
|
||||
// TODO Get these quotes working correctly
|
||||
// assertTrue(text, text.contains("individual income tax return “short forms.”"));
|
||||
|
||||
// Formula based strings
|
||||
// TODO Find some then test
|
||||
}
|
||||
|
||||
public void testFormattedNumbers() {
|
||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||
String text = extractor.getText();
|
||||
|
||||
// Simple numbers
|
||||
assertTrue(text, text.contains("151"));
|
||||
assertTrue(text, text.contains("784"));
|
||||
|
||||
// Numbers which come from formulas
|
||||
// TODO
|
||||
// assertTrue(text, text.contains("0.40"));
|
||||
// assertTrue(text, text.contains("624"));
|
||||
|
||||
// Formatted numbers
|
||||
// TODO
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue