From 37f001274abe9cf19683a7005912d81766878b78 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Sun, 30 Nov 2014 14:22:06 +0000 Subject: [PATCH] Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1642548 13f79535-47bb-0310-9956-ffa450edef68 --- .../poi/hssf/extractor/OldExcelExtractor.java | 61 ++++++++++++++++-- .../apache/poi/hssf/dev/TestBiffViewer.java | 4 +- .../hssf/extractor/TestOldExcelExtractor.java | 36 ++++++++++- test-data/spreadsheet/testEXCEL_5.xls | Bin 0 -> 7168 bytes test-data/spreadsheet/testEXCEL_95.xls | Bin 0 -> 7168 bytes 5 files changed, 90 insertions(+), 11 deletions(-) create mode 100644 test-data/spreadsheet/testEXCEL_5.xls create mode 100644 test-data/spreadsheet/testEXCEL_95.xls diff --git a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java index d78e2268f3..366d8e4996 100644 --- a/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java +++ b/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java @@ -17,6 +17,8 @@ package org.apache.poi.hssf.extractor; +import java.io.BufferedInputStream; +import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -28,11 +30,15 @@ import org.apache.poi.hssf.record.OldLabelRecord; import org.apache.poi.hssf.record.OldStringRecord; import org.apache.poi.hssf.record.RKRecord; import org.apache.poi.hssf.record.RecordInputStream; +import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentNode; +import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.poi.ss.usermodel.Cell; /** - * A text extractor for very old (pre-OLE2) Excel files, - * such as Excel 4 files. + * A text extractor for old Excel files, which are too old for + * HSSFWorkbook to handle. This includes Excel 95, and very old + * (pre-OLE2) Excel files, such as Excel 4 files. *

* Returns much (but not all) of the textual content of the file, * suitable for indexing by something like Apache Lucene, or used @@ -40,13 +46,47 @@ import org.apache.poi.ss.usermodel.Cell; *

*/ public class OldExcelExtractor { - private InputStream input; + private RecordInputStream ris; + private Closeable input; - public OldExcelExtractor(InputStream input) { - this.input = input; + public OldExcelExtractor(InputStream input) throws IOException { + BufferedInputStream bstream = new BufferedInputStream(input, 8); + if (NPOIFSFileSystem.hasPOIFSHeader(bstream)) { + open(new NPOIFSFileSystem(bstream)); + } else { + open(bstream); + } } public OldExcelExtractor(File f) throws IOException { - this.input = new FileInputStream(f); + InputStream input = new FileInputStream(f); + if (NPOIFSFileSystem.hasPOIFSHeader(input)) { + open(new NPOIFSFileSystem(f)); + } else { + open(input); + } + } + public OldExcelExtractor(NPOIFSFileSystem fs) throws IOException { + open(fs); + } + public OldExcelExtractor(DirectoryNode directory) throws IOException { + open(directory); + } + + private void open(InputStream biffStream) { + input = biffStream; + ris = new RecordInputStream(biffStream); + } + private void open(NPOIFSFileSystem fs) throws IOException { + input = fs; + open(fs.getRoot()); + } + private void open(DirectoryNode directory) throws IOException { + DocumentNode book = (DocumentNode)directory.getEntry("Book"); + if (book == null) { + throw new IOException("No Excel 5/95 Book stream found"); + } + + ris = new RecordInputStream(directory.createDocumentInputStream(book)); } public static void main(String[] args) throws Exception { @@ -66,7 +106,6 @@ public class OldExcelExtractor { public String getText() { StringBuffer text = new StringBuffer(); - RecordInputStream ris = new RecordInputStream(input); while (ris.hasNextRecord()) { int sid = ris.getNextSid(); ris.nextRecord(); @@ -108,6 +147,14 @@ public class OldExcelExtractor { ris.readFully(new byte[ris.remaining()]); } } + + if (input != null) { + try { + input.close(); + } catch (IOException e) {} + input = null; + } + ris = null; return text.toString(); } diff --git a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java index b7013c1503..e5062a0241 100644 --- a/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java +++ b/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java @@ -38,7 +38,9 @@ public class TestBiffViewer extends BaseXLSIteratingTest { SILENT_EXCLUDED.add("46904.xls"); SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption - SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2 + SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2 + SILENT_EXCLUDED.add("testEXCEL_5.xls"); // Biff 5 / Excel 5 + SILENT_EXCLUDED.add("testEXCEL_95.xls"); // Biff 5 / Excel 95 } @Override diff --git a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java index f6c36e16c1..8c58f9e4f3 100644 --- a/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java +++ b/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java @@ -24,7 +24,8 @@ import junit.framework.TestCase; import org.apache.poi.hssf.HSSFTestDataSamples; /** - * Unit tests for the Excel 4 (and older) text extractor + * Unit tests for the Excel 5/95 and Excel 4 (and older) text + * extractor */ public final class TestOldExcelExtractor extends TestCase { private static OldExcelExtractor createExtractor(String sampleFileName) { @@ -37,7 +38,7 @@ public final class TestOldExcelExtractor extends TestCase { } } - public void testSimple() { + public void testSimpleExcel4() { OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); // Check we can call getText without error @@ -51,6 +52,22 @@ public final class TestOldExcelExtractor extends TestCase { assertTrue(text, text.contains("11")); assertTrue(text, text.contains("784")); } + public void DISABLEDtestSimpleExcel5() { + for (String ver : new String[] {"5", "95"}) { + OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls"); + + // Check we can call getText without error + String text = extractor.getText(); + + // Check we find a few words we expect in there + assertTrue(text, text.contains("Sample Excel")); + assertTrue(text, text.contains("Written and saved")); + + // Check we find a few numbers we expect in there + assertTrue(text, text.contains("15")); + assertTrue(text, text.contains("169")); + } + } public void testStrings() { OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); @@ -71,7 +88,7 @@ public final class TestOldExcelExtractor extends TestCase { // TODO Find some then test } - public void testFormattedNumbers() { + public void testFormattedNumbersExcel4() { OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls"); String text = extractor.getText(); @@ -88,4 +105,17 @@ public final class TestOldExcelExtractor extends TestCase { // assertTrue(text, text.contains("55,624")); // assertTrue(text, text.contains("11,743,477")); } + public void DISABLEDtestFormattedNumbersExcel5() { + for (String ver : new String[] {"5", "95"}) { + OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls"); + String text = extractor.getText(); + + // Simple numbers + assertTrue(text, text.contains("1")); + + // Numbers which come from formulas + assertTrue(text, text.contains("13")); + assertTrue(text, text.contains("169")); + } + } } diff --git a/test-data/spreadsheet/testEXCEL_5.xls b/test-data/spreadsheet/testEXCEL_5.xls new file mode 100644 index 0000000000000000000000000000000000000000..ed69b3d30f8a0b7d4c39ffad1a387768d67b6419 GIT binary patch literal 7168 zcmeHLU2IfE6rQ{P+m?2F%TEQwD;+dO&fqq*lrRaz^8;z~I#A|APcdcFFc5F&s&Wmn4OvJ#^@oo*u! zTz31)F$`f4!vM!T3dDdoPzUgNgInH4uQ%P*DLl|G#_(4Zmgp0Clm+o3M!EJh8#Bh+ zL@k!ycu|Y~R65h`YY$B2D?a!7pYfYT`e)qd`kw(b0!;w60MQJXKnpMvm<7xR z<^Zk0UBFym9xxxc8(09`16aVlz(QaVuo$=xSOT;G?Z8r?1Ly>jz%rlpz1co6l6fYrbnSH9LQ*SV$k*qvRA#}MViVUde}9X$`X2w3^FFKJfQS_z|mE zbN#sZvuufz7R-?-V`m}zV=5UtU#;gSBX70d?I}AB-&Fgmlkax%T3;>oktKa(9X_)7 zKV%V%8QP6`+o;GEwR&Yj>a|~$9SI=zl|-3kq7;;6b15QAvlNnLODQVLnWb>~AmZRC zL#BvqbSAR7ZVFEDoa72tQ1GQZELSvE@?mAMCFEk8A*X)>@*xCME`?^!drYdB`0Ksj z&Tbvt(etd7m*hc=+z5~Cs3?m!5v-H=dled|LLJhwQ|7Np8x@LQm$Elxz63lMer5W( zr_EDuNBwNuOZdOgv(!^RhQJ(0@3?>sFeHtnQGen&{7?3%(hyJL3EnNh_haB^|6&1d z4XnHO0;5UFg5U{O$-+Leh>tAlBa8XS;y$uEA6dPRtieY%!$;QWBWv=JF+fzmZ?lig z^pUmr$hf&0;!@>K2A_rk4@*DK!WKA?9^IF7tiA&oCui*%E9@N~ah#&H%-S|FI_MO} zt#p3KDvmhWf;I5MM7rRNe=aw416wx#f?#VzZeSE6qGD8$1f-;5^fJboBh~~RUKA|+ z3dq9%J94pyGnf|!8%id#a0FXD$CC#JnR+Wa@rup@c!)F+4^6^V0SzooW|D?eK+(iB z4RsVv)I+mS)4}Dwq-kJj+Q=jgSAe3i zG!1nW4MUbv*`aA*Y1+sn4Hu2lQNsY=P)E@){<$i>J2ed~O&gh{;W|?^i(MViP)E`5 z6rog})-pKiH_vDqSeiC6NyF8wXxcRmbrelP(}+P$1549J zCTSP|6itVwp^l>APT;Coj;4X7X(N*~j2Vh1scERAXiQC08qqYcG;L&(hG9n0bZHvu zD4G^cW92jrEKM6(I270-kNtB*O6)3Ri$y0dgUxt)zcXZI^VZgErZ6@>He7V0({5{c ztYB?TXRHCIus@q|tbQlwq{khryQ}U|M3@sY!r@?!>p~Ob1GicOF#@&Vov@L*p;Aa^ z$RAgoQF&yz8q>~zBfQRV{ZexQ+sb}av&FTFS8&4N;ix*XPm>4V`Tf%Nk;MCN#YAV@ z$KUb3`UAjkYJ9!}pyh6W$EE`SzquU-sPits0sa8sQSuXjckr(O-owuTwEyLDX^AOpH4DBILYt^WOfrlP!)|DJP$Iib~FJef^tLecNzZrXD%j*Iq!SUR6OU z>z_uOH-=6dte1@7p-rfxdc(=Ut({Anx9~)aEem<`M6r?I3#wzss;=eBcBTiNT+Tr> z5F(Ww9$tm=X4Cb?k39*T`{vxyu6c7?ZM$Gv`PHOY83qQA+>IaHGol+JJ=AsJ%3$O)(N<0f*p z&vsR+h2jAy@q)$_76W2JjN&gXzrAk8SJ+|rq#z|lkwraUQT5H+(LQ4bzuU79zDiX> zu9pUPwHjgrS{N1k;D5PB ze|XbG!-Fp*hNLe>V`5BrG79lUAB=_s6MZVi#P}j;^npZJzwg{Ty_fZtF6ax|X}>eG z=brgy?wOf0GxwXX8>dfw)N(;pq^&Bdu9j0O5zreb*XiGBr6MR(cePwDYc-1V=@#<9 z6|b)x!x#oJ4sgtCfFzItY5_i9;N`c_>&3AS^<(TLWW9Po<^wABC~Gq^o)Hd3dW82)kt;>kLmKLPDR*J+d;!by%WoOTFd^gH z3b}0F*M9;XQ&1n#_96N(`x*X#{SvQT+8aR4V@_cxJCe>WqczNYt^j(EUD_j-9Wx3)7Le((cyOO~E&nzUtt+UAop+YeRII5M6tSF7-EE z0%L}DlfgDz-KyrGPKjOHjP67Pv9F}c%+sZ)C_|=7 z>~JP?`7SC>^PJ=+jeO`XwzMp^g);pYKn@|8aw#-&-eXcF)t~SEdT#f? zzV4?)Uy_3uxe*@OQPpC+fnc4%-z%^<8*7)E9U@Auy-VJ1$@Y3>hP1)SZ4B|I^(%H`Jqef_IDX{TTS!zhs141IsOj z6pRMZMZpuT(8WV^i4a{)h%OnTONHoaLv(c^x&#RMKqXSN1!pe>f zS;Y}2SFrk@oy-=TiBDxiH?U>%FABDLNCTsoP;Mnx6A_JD=_QOaN2~!hyeL@s6_LXL zJ94R;Gnf|!8%8EYJb|sAhL z1ZYhL255SH8q_px2%+J0)-=m~8ro=@I-jQWuup@UrVSx9TmhQK@@Z(J zX&AD!$v&S3HBB2rXt-#!jUER0hBlgp@y|2qIpEWvrfEY64cD2bS?$??hBlgprwDEG zxKD$crVSx9T$Sk7)1anlL&Rf|opS77s(RJILatbJ#w6HGWRE#RR&LDNof|BSPmB*2z36nv8XhlL zyR(B#xwN0o^KG=2Qt-!AVR zNx%PgQgyU`{2lMBF94j~e9qeuzpwQG?DZ~yN5(?{@7g?i@xFZ&U<*HAaqrG7ldA{N zsYe%(&sGkY%oGjWnz^KS7fdI~tkfBiY6tJ@m1}oX=lXRAvI9;&@5p=f=7xtix%M|D zXY(I>5;_0X`IDW?(kI@5|1y_9w4sYv=ng~I4A7qh51y<`U(Ke+NSIkwI-ZR<-sI#QuBsA@zU2Z#GSps`SC0GRCM4TuyE;`vaSx1m zL1T)mel@8^@t2ivuaDp>>@a*%5KU3#P#$wlLuDJ<=aq}!?b(M=qY5S0>j$3BP}|YM zC}PZBHK3lUu$=oR&lvjMt8JfgZ>L8cS2o%o6dyS91^UMPvH2$2=cTDQ_p4QHaii}a P{Qg>XBiAwfC++_Q