mirror of https://github.com/apache/poi.git
Start on a Text Extractor for the pre-OLE2 Excel formats like Excel 4, for TIKA-1490
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642490 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5d3db739db
commit
73bd034c79
|
@ -39,7 +39,7 @@
|
|||
# 2. cd build/dist
|
||||
# 3. ./mvn-deploy.sh
|
||||
|
||||
M2_REPOSITORY=M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
|
||||
M2_REPOSITORY=https://repository.apache.org/service/local/staging/deploy/maven2
|
||||
|
||||
VERSION=@VERSION@
|
||||
DSTAMP=@DSTAMP@
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hssf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.hssf.record.LabelRecord;
|
||||
import org.apache.poi.hssf.record.OldLabelRecord;
|
||||
import org.apache.poi.hssf.record.RecordInputStream;
|
||||
|
||||
/**
|
||||
* A text extractor for very old (pre-OLE2) Excel files,
|
||||
* such as Excel 4 files.
|
||||
* <p>
|
||||
* Returns much (but not all) of the textual content of the file,
|
||||
* suitable for indexing by something like Apache Lucene, or used
|
||||
* by Apache Tika, but not really intended for display to the user.
|
||||
* </p>
|
||||
*/
|
||||
public class OldExcelExtractor {
|
||||
private InputStream input;
|
||||
private boolean _includeSheetNames = true;
|
||||
|
||||
public OldExcelExtractor(InputStream input) {
|
||||
this.input = input;
|
||||
}
|
||||
public OldExcelExtractor(File f) throws IOException {
|
||||
this.input = new FileInputStream(f);
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if (args.length < 1) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" OldExcelExtractor <filename>");
|
||||
System.exit(1);
|
||||
}
|
||||
OldExcelExtractor extractor = new OldExcelExtractor(new File(args[0]));
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
|
||||
/**
|
||||
* Should sheet names be included? Default is true
|
||||
*/
|
||||
public void setIncludeSheetNames(boolean includeSheetNames) {
|
||||
_includeSheetNames = includeSheetNames;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves the text contents of the file, as best we can
|
||||
* for these old file formats
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
RecordInputStream ris = new RecordInputStream(input);
|
||||
while (ris.hasNextRecord()) {
|
||||
int sid = ris.getNextSid();
|
||||
ris.nextRecord();
|
||||
|
||||
switch (sid) {
|
||||
case LabelRecord.sid:
|
||||
OldLabelRecord lr = new OldLabelRecord(ris);
|
||||
text.append(lr.getValue());
|
||||
text.append('\n');
|
||||
break;
|
||||
default:
|
||||
ris.readFully(new byte[ris.remaining()]);
|
||||
}
|
||||
|
||||
// label - 5.63 - TODO Needs codepages
|
||||
// number - 5.71
|
||||
// rk - 5.87
|
||||
// string - 5.102
|
||||
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
|
@ -25,9 +25,7 @@ import org.apache.poi.util.POILogger;
|
|||
* Label Record (0x0204) - read only support for strings stored directly in the cell. Don't
|
||||
* use this (except to read), use LabelSST instead <P>
|
||||
* REFERENCE: PG 325 Microsoft Excel 97 Developer's Kit (ISBN: 1-57231-498-2)<P>
|
||||
* @author Andrew C. Oliver (acoliver at apache dot org)
|
||||
* @author Jason Height (jheight at chariot dot net dot au)
|
||||
* @version 2.0-pre
|
||||
*
|
||||
* @see org.apache.poi.hssf.record.LabelSSTRecord
|
||||
*/
|
||||
public final class LabelRecord extends Record implements CellValueRecordInterface {
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hssf.record;
|
||||
|
||||
import org.apache.poi.util.HexDump;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* Biff2 - Biff 4 Label Record (0x0004 / 0x0204) - read only support for
|
||||
* strings stored directly in the cell, from the older file formats that
|
||||
* didn't use {@link LabelSSTRecord}
|
||||
*/
|
||||
public final class OldLabelRecord extends Record implements CellValueRecordInterface {
|
||||
private final static POILogger logger = POILogFactory.getLogger(OldLabelRecord.class);
|
||||
|
||||
public final static short biff2_sid = 0x0004;
|
||||
public final static short biff345_sid = 0x0204;
|
||||
|
||||
private short sid;
|
||||
private int field_1_row;
|
||||
private short field_2_column;
|
||||
private int field_3_cell_attrs; // Biff 2
|
||||
private short field_3_xf_index; // Biff 3+
|
||||
private short field_4_string_len;
|
||||
private byte[] field_5_bytes;
|
||||
//private XXXXX codepage; // TODO
|
||||
|
||||
/**
|
||||
* @param in the RecordInputstream to read the record from
|
||||
*/
|
||||
public OldLabelRecord(RecordInputStream in)
|
||||
{
|
||||
sid = in.getSid();
|
||||
|
||||
field_1_row = in.readUShort();
|
||||
field_2_column = in.readShort();
|
||||
|
||||
if (in.getSid() == biff2_sid) {
|
||||
field_3_cell_attrs = in.readUShort() << 8;
|
||||
field_3_cell_attrs += in.readUByte();
|
||||
field_4_string_len = (short)in.readUByte();
|
||||
} else {
|
||||
field_3_xf_index = in.readShort();
|
||||
field_4_string_len = in.readShort();
|
||||
}
|
||||
|
||||
// Can only decode properly later when you know the codepage
|
||||
field_5_bytes = new byte[field_4_string_len];
|
||||
in.read(field_5_bytes, 0, field_4_string_len);
|
||||
|
||||
if (in.remaining() > 0) {
|
||||
logger.log(POILogger.INFO,
|
||||
"LabelRecord data remains: " + in.remaining() +
|
||||
" : " + HexDump.toHex(in.readRemainder())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isBiff2() {
|
||||
return sid == biff2_sid;
|
||||
}
|
||||
|
||||
public int getRow()
|
||||
{
|
||||
return field_1_row;
|
||||
}
|
||||
|
||||
public short getColumn()
|
||||
{
|
||||
return field_2_column;
|
||||
}
|
||||
|
||||
public short getXFIndex()
|
||||
{
|
||||
return field_3_xf_index;
|
||||
}
|
||||
public int getCellAttrs()
|
||||
{
|
||||
return field_3_cell_attrs;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the number of characters this string contains
|
||||
* @return number of characters
|
||||
*/
|
||||
public short getStringLength()
|
||||
{
|
||||
return field_4_string_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the String of the cell
|
||||
*/
|
||||
public String getValue()
|
||||
{
|
||||
// We really need the codepage here to do this right...
|
||||
return new String(field_5_bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Not supported
|
||||
*/
|
||||
public int serialize(int offset, byte [] data) {
|
||||
throw new RecordFormatException("Old Label Records are supported READ ONLY");
|
||||
}
|
||||
public int getRecordSize() {
|
||||
throw new RecordFormatException("Old Label Records are supported READ ONLY");
|
||||
}
|
||||
|
||||
public short getSid()
|
||||
{
|
||||
return sid;
|
||||
}
|
||||
|
||||
public String toString()
|
||||
{
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append("[OLD LABEL]\n");
|
||||
sb.append(" .row = ").append(HexDump.shortToHex(getRow())).append("\n");
|
||||
sb.append(" .column = ").append(HexDump.shortToHex(getColumn())).append("\n");
|
||||
if (isBiff2()) {
|
||||
sb.append(" .cellattrs = ").append(HexDump.shortToHex(getCellAttrs())).append("\n");
|
||||
} else {
|
||||
sb.append(" .xfindex = ").append(HexDump.shortToHex(getXFIndex())).append("\n");
|
||||
}
|
||||
sb.append(" .string_len= ").append(HexDump.shortToHex(field_4_string_len)).append("\n");
|
||||
sb.append(" .value = ").append(getValue()).append("\n");
|
||||
sb.append("[/OLD LABEL]\n");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* NO-OP!
|
||||
*/
|
||||
public void setColumn(short col)
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* NO-OP!
|
||||
*/
|
||||
public void setRow(int row)
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* no op!
|
||||
*/
|
||||
public void setXFIndex(short xf)
|
||||
{
|
||||
}
|
||||
}
|
|
@ -38,6 +38,7 @@ public class TestBiffViewer extends BaseXLSIteratingTest {
|
|||
SILENT_EXCLUDED.add("46904.xls");
|
||||
SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header
|
||||
SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
|
||||
SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hssf.extractor;
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.poi.hssf.HSSFTestDataSamples;
|
||||
|
||||
/**
|
||||
* Unit tests for the Excel 4 (and older) text extractor
|
||||
*/
|
||||
public final class TestOldExcelExtractor extends TestCase {
|
||||
private static OldExcelExtractor createExtractor(String sampleFileName) {
|
||||
InputStream is = HSSFTestDataSamples.openSampleFileStream(sampleFileName);
|
||||
|
||||
try {
|
||||
return new OldExcelExtractor(is);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void testSimple() {
|
||||
OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
|
||||
|
||||
// Check we can call getText without error
|
||||
String text = extractor.getText();
|
||||
|
||||
// Check we find a few words we expect in there
|
||||
assertTrue(text, text.contains("Size"));
|
||||
assertTrue(text, text.contains("Returns"));
|
||||
}
|
||||
|
||||
// TODO Rest of the tests
|
||||
}
|
Binary file not shown.
Loading…
Reference in New Issue