Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642548 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2014-11-30 14:22:06 +00:00
parent 63fd48d501
commit 37f001274a
5 changed files with 90 additions and 11 deletions

View File

@ -17,6 +17,8 @@
package org.apache.poi.hssf.extractor; package org.apache.poi.hssf.extractor;
import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
@ -28,11 +30,15 @@ import org.apache.poi.hssf.record.OldLabelRecord;
import org.apache.poi.hssf.record.OldStringRecord; import org.apache.poi.hssf.record.OldStringRecord;
import org.apache.poi.hssf.record.RKRecord; import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.RecordInputStream; import org.apache.poi.hssf.record.RecordInputStream;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Cell;
/** /**
* A text extractor for very old (pre-OLE2) Excel files, * A text extractor for old Excel files, which are too old for
* such as Excel 4 files. * HSSFWorkbook to handle. This includes Excel 95, and very old
* (pre-OLE2) Excel files, such as Excel 4 files.
* <p> * <p>
* Returns much (but not all) of the textual content of the file, * Returns much (but not all) of the textual content of the file,
* suitable for indexing by something like Apache Lucene, or used * suitable for indexing by something like Apache Lucene, or used
@ -40,13 +46,47 @@ import org.apache.poi.ss.usermodel.Cell;
* </p> * </p>
*/ */
public class OldExcelExtractor { public class OldExcelExtractor {
private InputStream input; private RecordInputStream ris;
private Closeable input;
public OldExcelExtractor(InputStream input) { public OldExcelExtractor(InputStream input) throws IOException {
this.input = input; BufferedInputStream bstream = new BufferedInputStream(input, 8);
if (NPOIFSFileSystem.hasPOIFSHeader(bstream)) {
open(new NPOIFSFileSystem(bstream));
} else {
open(bstream);
}
} }
/**
 * Creates an extractor for the given file, which may be either an
 * OLE2-wrapped workbook or a raw (pre-OLE2) BIFF stream.
 *
 * @param f the file to read
 * @throws IOException if the file cannot be opened or read
 */
public OldExcelExtractor(File f) throws IOException {
    // hasPOIFSHeader() peeks at the first 8 bytes and then rewinds the
    // stream. A bare FileInputStream has no mark/reset support, so the
    // file stream is buffered first (same approach as the InputStream
    // constructor).
    InputStream probe = new BufferedInputStream(new FileInputStream(f), 8);
    boolean probeHandedOff = false;
    try {
        if (NPOIFSFileSystem.hasPOIFSHeader(probe)) {
            // OLE2 container: the filesystem opens the file itself, so the
            // probe stream is no longer needed and is closed below.
            NPOIFSFileSystem fs = new NPOIFSFileSystem(f);
            boolean opened = false;
            try {
                open(fs);
                opened = true;
            } finally {
                if (!opened) {
                    // Don't leak the file handle if the Book stream can't be opened
                    fs.close();
                }
            }
        } else {
            // Raw BIFF stream: the probe becomes the extractor's input
            open(probe);
            probeHandedOff = true;
        }
    } finally {
        if (!probeHandedOff) {
            probe.close();
        }
    }
}
/**
 * Creates an extractor that reads from an already opened OLE2 filesystem.
 * The filesystem is handed to {@code open(NPOIFSFileSystem)}, which records
 * it as this extractor's closeable input.
 *
 * @param fs the filesystem to read from
 * @throws IOException if the filesystem cannot be opened for extraction
 */
public OldExcelExtractor(NPOIFSFileSystem fs) throws IOException {
    this.open(fs);
}
/**
 * Creates an extractor that reads from the given directory of an OLE2
 * filesystem, by delegating to {@code open(DirectoryNode)}.
 *
 * @param directory the directory to read from
 * @throws IOException if the directory cannot be opened for extraction
 */
public OldExcelExtractor(DirectoryNode directory) throws IOException {
    this.open(directory);
}
/**
 * Sets this extractor up to read records straight from the given stream.
 * The stream is kept as the closeable input and is also wrapped in the
 * record reader.
 *
 * @param biffStream the stream to read records from
 */
private void open(InputStream biffStream) {
    this.input = biffStream;
    this.ris = new RecordInputStream(biffStream);
}
/**
 * Sets this extractor up to read from an OLE2 filesystem: records the
 * filesystem as the closeable input, then opens its root directory.
 *
 * @param fs the filesystem to read from
 * @throws IOException if the root directory cannot be opened
 */
private void open(NPOIFSFileSystem fs) throws IOException {
    this.input = fs;
    final DirectoryNode root = fs.getRoot();
    open(root);
}
private void open(DirectoryNode directory) throws IOException {
DocumentNode book = (DocumentNode)directory.getEntry("Book");
if (book == null) {
throw new IOException("No Excel 5/95 Book stream found");
}
ris = new RecordInputStream(directory.createDocumentInputStream(book));
} }
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -66,7 +106,6 @@ public class OldExcelExtractor {
public String getText() { public String getText() {
StringBuffer text = new StringBuffer(); StringBuffer text = new StringBuffer();
RecordInputStream ris = new RecordInputStream(input);
while (ris.hasNextRecord()) { while (ris.hasNextRecord()) {
int sid = ris.getNextSid(); int sid = ris.getNextSid();
ris.nextRecord(); ris.nextRecord();
@ -108,6 +147,14 @@ public class OldExcelExtractor {
ris.readFully(new byte[ris.remaining()]); ris.readFully(new byte[ris.remaining()]);
} }
} }
if (input != null) {
try {
input.close();
} catch (IOException e) {}
input = null;
}
ris = null;
return text.toString(); return text.toString();
} }

View File

@ -38,7 +38,9 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
SILENT_EXCLUDED.add("46904.xls"); SILENT_EXCLUDED.add("46904.xls");
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2 SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
SILENT_EXCLUDED.add("testEXCEL_5.xls"); // Biff 5 / Excel 5
SILENT_EXCLUDED.add("testEXCEL_95.xls"); // Biff 5 / Excel 95
} }
@Override @Override

View File

@ -24,7 +24,8 @@ import junit.framework.TestCase;
import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.HSSFTestDataSamples;
/** /**
* Unit tests for the Excel 4 (and older) text extractor * Unit tests for the Excel 5/95 and Excel 4 (and older) text
* extractor
*/ */
public final class TestOldExcelExtractor extends TestCase { public final class TestOldExcelExtractor extends TestCase {
private static OldExcelExtractor createExtractor(String sampleFileName) { private static OldExcelExtractor createExtractor(String sampleFileName) {
@ -37,7 +38,7 @@ public final class TestOldExcelExtractor extends TestCase {
} }
} }
public void testSimple() { public void testSimpleExcel4() {
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
// Check we can call getText without error // Check we can call getText without error
@ -51,6 +52,22 @@ public final class TestOldExcelExtractor extends TestCase {
assertTrue(text, text.contains("11")); assertTrue(text, text.contains("11"));
assertTrue(text, text.contains("784")); assertTrue(text, text.contains("784"));
} }
/**
 * Checks that text can be extracted from the Excel 5 and Excel 95 sample
 * files, and that a few known words and numbers appear in the result.
 * The method name does not start with "test", so presumably the JUnit 3
 * runner skips it for now.
 */
public void DISABLEDtestSimpleExcel5() {
    final String[] versions = {"5", "95"};
    // A few words, then a few numbers, that should be in the extracted text
    final String[] expected = {"Sample Excel", "Written and saved", "15", "169"};

    for (int v = 0; v < versions.length; v++) {
        final String sampleName = "testEXCEL_" + versions[v] + ".xls";

        // getText() itself must complete without error
        final String text = createExtractor(sampleName).getText();

        for (int e = 0; e < expected.length; e++) {
            assertTrue(text, text.contains(expected[e]));
        }
    }
}
public void testStrings() { public void testStrings() {
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
@ -71,7 +88,7 @@ public final class TestOldExcelExtractor extends TestCase {
// TODO Find some then test // TODO Find some then test
} }
public void testFormattedNumbers() { public void testFormattedNumbersExcel4() {
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
String text = extractor.getText(); String text = extractor.getText();
@ -88,4 +105,17 @@ public final class TestOldExcelExtractor extends TestCase {
// assertTrue(text, text.contains("55,624")); // assertTrue(text, text.contains("55,624"));
// assertTrue(text, text.contains("11,743,477")); // assertTrue(text, text.contains("11,743,477"));
} }
/**
 * Checks that plain numbers and formula results appear in the text
 * extracted from the Excel 5 and Excel 95 sample files.
 * The method name does not start with "test", so presumably the JUnit 3
 * runner skips it for now.
 */
public void DISABLEDtestFormattedNumbersExcel5() {
    final String[] versions = {"5", "95"};
    // First a simple number, then numbers which come from formulas
    final String[] expected = {"1", "13", "169"};

    for (int v = 0; v < versions.length; v++) {
        final String sampleName = "testEXCEL_" + versions[v] + ".xls";
        final String text = createExtractor(sampleName).getText();

        for (int e = 0; e < expected.length; e++) {
            assertTrue(text, text.contains(expected[e]));
        }
    }
}
} }

Binary file not shown.

Binary file not shown.