mirror of https://github.com/apache/poi.git
Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642548 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
63fd48d501
commit
37f001274a
|
@ -17,6 +17,8 @@
|
|||
|
||||
package org.apache.poi.hssf.extractor;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.Closeable;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
|
@ -28,11 +30,15 @@ import org.apache.poi.hssf.record.OldLabelRecord;
|
|||
import org.apache.poi.hssf.record.OldStringRecord;
|
||||
import org.apache.poi.hssf.record.RKRecord;
|
||||
import org.apache.poi.hssf.record.RecordInputStream;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.DocumentNode;
|
||||
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||
import org.apache.poi.ss.usermodel.Cell;
|
||||
|
||||
/**
|
||||
* A text extractor for very old (pre-OLE2) Excel files,
|
||||
* such as Excel 4 files.
|
||||
* A text extractor for old Excel files, which are too old for
|
||||
* HSSFWorkbook to handle. This includes Excel 95, and very old
|
||||
* (pre-OLE2) Excel files, such as Excel 4 files.
|
||||
* <p>
|
||||
* Returns much (but not all) of the textual content of the file,
|
||||
* suitable for indexing by something like Apache Lucene, or used
|
||||
|
@ -40,13 +46,47 @@ import org.apache.poi.ss.usermodel.Cell;
|
|||
* </p>
|
||||
*/
|
||||
public class OldExcelExtractor {
|
||||
private InputStream input;
|
||||
private RecordInputStream ris;
|
||||
private Closeable input;
|
||||
|
||||
public OldExcelExtractor(InputStream input) {
|
||||
this.input = input;
|
||||
public OldExcelExtractor(InputStream input) throws IOException {
|
||||
BufferedInputStream bstream = new BufferedInputStream(input, 8);
|
||||
if (NPOIFSFileSystem.hasPOIFSHeader(bstream)) {
|
||||
open(new NPOIFSFileSystem(bstream));
|
||||
} else {
|
||||
open(bstream);
|
||||
}
|
||||
}
|
||||
public OldExcelExtractor(File f) throws IOException {
|
||||
this.input = new FileInputStream(f);
|
||||
InputStream input = new FileInputStream(f);
|
||||
if (NPOIFSFileSystem.hasPOIFSHeader(input)) {
|
||||
open(new NPOIFSFileSystem(f));
|
||||
} else {
|
||||
open(input);
|
||||
}
|
||||
}
|
||||
/**
 * Creates an extractor over an already opened filesystem.
 *
 * @param fs the filesystem to read from
 * @throws IOException if the filesystem cannot be opened for extraction
 */
public OldExcelExtractor(NPOIFSFileSystem fs) throws IOException {
    this.open(fs);
}
|
||||
/**
 * Creates an extractor over a directory of an already opened filesystem.
 *
 * @param directory the directory to read the workbook stream from
 * @throws IOException if the directory cannot be opened for extraction
 */
public OldExcelExtractor(DirectoryNode directory) throws IOException {
    this.open(directory);
}
|
||||
|
||||
private void open(InputStream biffStream) {
|
||||
input = biffStream;
|
||||
ris = new RecordInputStream(biffStream);
|
||||
}
|
||||
private void open(NPOIFSFileSystem fs) throws IOException {
|
||||
input = fs;
|
||||
open(fs.getRoot());
|
||||
}
|
||||
private void open(DirectoryNode directory) throws IOException {
|
||||
DocumentNode book = (DocumentNode)directory.getEntry("Book");
|
||||
if (book == null) {
|
||||
throw new IOException("No Excel 5/95 Book stream found");
|
||||
}
|
||||
|
||||
ris = new RecordInputStream(directory.createDocumentInputStream(book));
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
@ -66,7 +106,6 @@ public class OldExcelExtractor {
|
|||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
RecordInputStream ris = new RecordInputStream(input);
|
||||
while (ris.hasNextRecord()) {
|
||||
int sid = ris.getNextSid();
|
||||
ris.nextRecord();
|
||||
|
@ -108,6 +147,14 @@ public class OldExcelExtractor {
|
|||
ris.readFully(new byte[ris.remaining()]);
|
||||
}
|
||||
}
|
||||
|
||||
if (input != null) {
|
||||
try {
|
||||
input.close();
|
||||
} catch (IOException e) {}
|
||||
input = null;
|
||||
}
|
||||
ris = null;
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
|
|
|
@ -38,7 +38,9 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
|
|||
SILENT_EXCLUDED.add("46904.xls");
|
||||
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
|
||||
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
|
||||
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
|
||||
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
|
||||
SILENT_EXCLUDED.add("testEXCEL_5.xls"); // Biff 5 / Excel 5
|
||||
SILENT_EXCLUDED.add("testEXCEL_95.xls"); // Biff 5 / Excel 95
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -24,7 +24,8 @@ import junit.framework.TestCase;
|
|||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
|
||||
/**
|
||||
* Unit tests for the Excel 4 (and older) text extractor
|
||||
* Unit tests for the Excel 5/95 and Excel 4 (and older) text
|
||||
* extractor
|
||||
*/
|
||||
public final class TestOldExcelExtractor extends TestCase {
|
||||
private static OldExcelExtractor createExtractor(String sampleFileName) {
|
||||
|
@ -37,7 +38,7 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testSimple() {
|
||||
public void testSimpleExcel4() {
|
||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||
|
||||
// Check we can call getText without error
|
||||
|
@ -51,6 +52,22 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||
assertTrue(text, text.contains("11"));
|
||||
assertTrue(text, text.contains("784"));
|
||||
}
|
||||
public void DISABLEDtestSimpleExcel5() {
|
||||
for (String ver : new String[] {"5", "95"}) {
|
||||
OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
|
||||
|
||||
// Check we can call getText without error
|
||||
String text = extractor.getText();
|
||||
|
||||
// Check we find a few words we expect in there
|
||||
assertTrue(text, text.contains("Sample Excel"));
|
||||
assertTrue(text, text.contains("Written and saved"));
|
||||
|
||||
// Check we find a few numbers we expect in there
|
||||
assertTrue(text, text.contains("15"));
|
||||
assertTrue(text, text.contains("169"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testStrings() {
|
||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||
|
@ -71,7 +88,7 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||
// TODO Find some then test
|
||||
}
|
||||
|
||||
public void testFormattedNumbers() {
|
||||
public void testFormattedNumbersExcel4() {
|
||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||
String text = extractor.getText();
|
||||
|
||||
|
@ -88,4 +105,17 @@ public final class TestOldExcelExtractor extends TestCase {
|
|||
// assertTrue(text, text.contains("55,624"));
|
||||
// assertTrue(text, text.contains("11,743,477"));
|
||||
}
|
||||
public void DISABLEDtestFormattedNumbersExcel5() {
|
||||
for (String ver : new String[] {"5", "95"}) {
|
||||
OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
|
||||
String text = extractor.getText();
|
||||
|
||||
// Simple numbers
|
||||
assertTrue(text, text.contains("1"));
|
||||
|
||||
// Numbers which come from formulas
|
||||
assertTrue(text, text.contains("13"));
|
||||
assertTrue(text, text.contains("169"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue