mirror of https://github.com/apache/poi.git
Implement an Excel text extractor, and put all the existing text extractors under a common superclass, so they're easier to find and use
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@589224 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6a4d5c5a75
commit
6a72c5656a
|
@ -0,0 +1,49 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi;
|
||||
|
||||
/**
|
||||
* Common Parent for Text Extractors
|
||||
* of POI Documents.
|
||||
* You will typically find the implementation of
|
||||
* a given format's text extractor under
|
||||
* org.apache.poi.[format].extractor .
|
||||
* @see org.apache.poi.hssf.extractor.ExcelExtractor
|
||||
* @see org.apache.poi.hslf.extractor.PowerPointExtractor
|
||||
* @see org.apache.poi.hdgf.extractor.VisioTextExtractor
|
||||
* @see org.apache.poi.hwpf.extractor.WordExtractor
|
||||
*/
|
||||
public abstract class POITextExtractor {
|
||||
/** The POIDocument that's open */
|
||||
protected POIDocument document;
|
||||
|
||||
/**
|
||||
* Creates a new text extractor for the given document
|
||||
*/
|
||||
public POITextExtractor(POIDocument document) {
|
||||
this.document = document;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all the text from the document.
|
||||
* How cells, paragraphs etc are separated in the text
|
||||
* is implementation specific - see the javadocs for
|
||||
* a specific project for details.
|
||||
* @return All the text from the document
|
||||
*/
|
||||
public abstract String getText();
|
||||
}
|
|
@ -0,0 +1,144 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hssf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.hssf.usermodel.HSSFCell;
|
||||
import org.apache.poi.hssf.usermodel.HSSFRichTextString;
|
||||
import org.apache.poi.hssf.usermodel.HSSFRow;
|
||||
import org.apache.poi.hssf.usermodel.HSSFSheet;
|
||||
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* A text extractor for Excel files.
|
||||
* Returns the textual content of the file, suitable for
|
||||
* indexing by something like Lucene, but not really
|
||||
* intended for display to the user.
|
||||
* To turn an excel file into a CSV or similar, then see
|
||||
* the XLS2CSVmra example
|
||||
* @see org.apache.poi.hssf.eventusermodel.examples.XLS2CSVmra
|
||||
*/
|
||||
public class ExcelExtractor extends POITextExtractor{
|
||||
private HSSFWorkbook wb;
|
||||
private boolean includeSheetNames = true;
|
||||
private boolean formulasNotResults = false;
|
||||
|
||||
public ExcelExtractor(HSSFWorkbook wb) {
|
||||
super(wb);
|
||||
this.wb = wb;
|
||||
}
|
||||
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(new HSSFWorkbook(fs));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Should sheet names be included? Default is true
|
||||
*/
|
||||
public void setIncludeSheetNames(boolean includeSheetNames) {
|
||||
this.includeSheetNames = includeSheetNames;
|
||||
}
|
||||
/**
|
||||
* Should we return the formula itself, and not
|
||||
* the result it produces? Default is false
|
||||
*/
|
||||
public void setFormulasNotResults(boolean formulasNotResults) {
|
||||
this.formulasNotResults = formulasNotResults;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retreives the text contents of the file
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
||||
for(int i=0;i<wb.getNumberOfSheets();i++) {
|
||||
HSSFSheet sheet = wb.getSheetAt(i);
|
||||
if(sheet == null) { continue; }
|
||||
|
||||
if(includeSheetNames) {
|
||||
String name = wb.getSheetName(i);
|
||||
if(name != null) {
|
||||
text.append(name);
|
||||
text.append("\n");
|
||||
}
|
||||
}
|
||||
|
||||
int firstRow = sheet.getFirstRowNum();
|
||||
int lastRow = sheet.getLastRowNum();
|
||||
for(int j=firstRow;j<=lastRow;j++) {
|
||||
HSSFRow row = sheet.getRow(j);
|
||||
if(row == null) { continue; }
|
||||
|
||||
// Check each cell in turn
|
||||
int firstCell = row.getFirstCellNum();
|
||||
int lastCell = row.getLastCellNum();
|
||||
for(int k=firstCell;k<lastCell;k++) {
|
||||
HSSFCell cell = row.getCell((short)k);
|
||||
boolean outputContents = false;
|
||||
if(cell == null) { continue; }
|
||||
|
||||
switch(cell.getCellType()) {
|
||||
case HSSFCell.CELL_TYPE_STRING:
|
||||
text.append(cell.getRichStringCellValue().getString());
|
||||
outputContents = true;
|
||||
break;
|
||||
case HSSFCell.CELL_TYPE_NUMERIC:
|
||||
// Note - we don't apply any formatting!
|
||||
text.append(cell.getNumericCellValue());
|
||||
outputContents = true;
|
||||
break;
|
||||
case HSSFCell.CELL_TYPE_BOOLEAN:
|
||||
text.append(cell.getBooleanCellValue());
|
||||
outputContents = true;
|
||||
break;
|
||||
case HSSFCell.CELL_TYPE_FORMULA:
|
||||
if(formulasNotResults) {
|
||||
text.append(cell.getCellFormula());
|
||||
} else {
|
||||
// Try it as a string, if not as a number
|
||||
HSSFRichTextString str =
|
||||
cell.getRichStringCellValue();
|
||||
if(str != null && str.length() > 0) {
|
||||
text.append(str.toString());
|
||||
} else {
|
||||
// Try and treat it as a number
|
||||
double val = cell.getNumericCellValue();
|
||||
text.append(val);
|
||||
}
|
||||
}
|
||||
outputContents = true;
|
||||
break;
|
||||
}
|
||||
|
||||
// Output a tab if we're not on the last cell
|
||||
if(outputContents && k < (lastCell-1)) {
|
||||
text.append("\t");
|
||||
}
|
||||
}
|
||||
|
||||
// Finish off the row
|
||||
text.append("\n");
|
||||
}
|
||||
}
|
||||
|
||||
return text.toString();
|
||||
}
|
||||
}
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.hdgf.HDGFDiagram;
|
||||
import org.apache.poi.hdgf.chunks.Chunk.Command;
|
||||
import org.apache.poi.hdgf.streams.ChunkStream;
|
||||
|
@ -33,11 +34,12 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
* Can operate on the command line (outputs to stdout), or
|
||||
* can return the text for you (eg for use with Lucene).
|
||||
*/
|
||||
public class VisioTextExtractor {
|
||||
public class VisioTextExtractor extends POITextExtractor {
|
||||
private HDGFDiagram hdgf;
|
||||
private POIFSFileSystem fs;
|
||||
|
||||
public VisioTextExtractor(HDGFDiagram hdgf) {
|
||||
super(hdgf);
|
||||
this.hdgf = hdgf;
|
||||
}
|
||||
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
|
||||
|
@ -84,6 +86,8 @@ public class VisioTextExtractor {
|
|||
|
||||
/**
|
||||
* Returns the textual contents of the file.
|
||||
* Each textual object's text will be separated
|
||||
* by a newline
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
|
|
|
@ -22,6 +22,8 @@ package org.apache.poi.hslf.extractor;
|
|||
|
||||
import java.io.*;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.hslf.*;
|
||||
import org.apache.poi.hslf.model.*;
|
||||
|
@ -34,12 +36,12 @@ import org.apache.poi.hslf.usermodel.*;
|
|||
* @author Nick Burch
|
||||
*/
|
||||
|
||||
public class PowerPointExtractor
|
||||
public class PowerPointExtractor extends POITextExtractor
|
||||
{
|
||||
private HSLFSlideShow _hslfshow;
|
||||
private SlideShow _show;
|
||||
private Slide[] _slides;
|
||||
private Notes[] _notes;
|
||||
private HSLFSlideShow _hslfshow;
|
||||
private SlideShow _show;
|
||||
private Slide[] _slides;
|
||||
private Notes[] _notes;
|
||||
|
||||
/**
|
||||
* Basic extractor. Returns all the text, and optionally all the notes
|
||||
|
@ -66,61 +68,50 @@ public class PowerPointExtractor
|
|||
ppe.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from a file
|
||||
* @param fileName The name of the file to extract from
|
||||
*/
|
||||
public PowerPointExtractor(String fileName) throws IOException {
|
||||
_hslfshow = new HSLFSlideShow(fileName);
|
||||
_show = new SlideShow(_hslfshow);
|
||||
_slides = _show.getSlides();
|
||||
_notes = _show.getNotes();
|
||||
}
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from a file
|
||||
* @param fileName The name of the file to extract from
|
||||
*/
|
||||
public PowerPointExtractor(String fileName) throws IOException {
|
||||
this(new FileInputStream(fileName));
|
||||
}
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from an Input Stream
|
||||
* @param iStream The input stream containing the PowerPoint document
|
||||
*/
|
||||
public PowerPointExtractor(InputStream iStream) throws IOException {
|
||||
this(new POIFSFileSystem(iStream));
|
||||
}
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from an open POIFSFileSystem
|
||||
* @param fs the POIFSFileSystem containing the PowerPoint document
|
||||
*/
|
||||
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(new HSLFSlideShow(fs));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from an Input Stream
|
||||
* @param iStream The input stream containing the PowerPoint document
|
||||
*/
|
||||
public PowerPointExtractor(InputStream iStream) throws IOException {
|
||||
_hslfshow = new HSLFSlideShow(iStream);
|
||||
_show = new SlideShow(_hslfshow);
|
||||
_slides = _show.getSlides();
|
||||
_notes = _show.getNotes();
|
||||
}
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from a HSLFSlideShow
|
||||
* @param ss the HSLFSlideShow to extract text from
|
||||
*/
|
||||
public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
|
||||
super(ss);
|
||||
_hslfshow = ss;
|
||||
_show = new SlideShow(_hslfshow);
|
||||
_slides = _show.getSlides();
|
||||
_notes = _show.getNotes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from an open POIFSFileSystem
|
||||
* @param fs the POIFSFileSystem containing the PowerPoint document
|
||||
*/
|
||||
public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
|
||||
_hslfshow = new HSLFSlideShow(fs);
|
||||
_show = new SlideShow(_hslfshow);
|
||||
_slides = _show.getSlides();
|
||||
_notes = _show.getNotes();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a PowerPointExtractor, from a HSLFSlideShow
|
||||
* @param ss the HSLFSlideShow to extract text from
|
||||
*/
|
||||
public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
|
||||
_hslfshow = ss;
|
||||
_show = new SlideShow(_hslfshow);
|
||||
_slides = _show.getSlides();
|
||||
_notes = _show.getNotes();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Shuts down the underlying streams
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
_hslfshow.close();
|
||||
_hslfshow = null;
|
||||
_show = null;
|
||||
_slides = null;
|
||||
_notes = null;
|
||||
}
|
||||
/**
|
||||
* Shuts down the underlying streams
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
_hslfshow.close();
|
||||
_hslfshow = null;
|
||||
_show = null;
|
||||
_slides = null;
|
||||
_notes = null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
|
@ -195,4 +186,4 @@ public class PowerPointExtractor
|
|||
|
||||
return ret.toString();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -22,6 +22,7 @@ import java.io.FileInputStream;
|
|||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.POITextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
|
@ -36,7 +37,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
*
|
||||
* @author Nick Burch (nick at torchbox dot com)
|
||||
*/
|
||||
public class WordExtractor {
|
||||
public class WordExtractor extends POITextExtractor {
|
||||
private POIFSFileSystem fs;
|
||||
private HWPFDocument doc;
|
||||
|
||||
|
@ -62,6 +63,7 @@ public class WordExtractor {
|
|||
* @param doc The HWPFDocument to extract from
|
||||
*/
|
||||
public WordExtractor(HWPFDocument doc) throws IOException {
|
||||
super(doc);
|
||||
this.doc = doc;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hssf.extractor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestExcelExtractor extends TestCase {
|
||||
public void testSimple() throws Exception {
|
||||
String path = System.getProperty("HSSF.testdata.path");
|
||||
FileInputStream fin = new FileInputStream(path + File.separator + "Simple.xls");
|
||||
|
||||
ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
|
||||
|
||||
assertEquals("Sheet1\nreplaceMe\nSheet2\nSheet3\n", extractor.getText());
|
||||
|
||||
// Now turn off sheet names
|
||||
extractor.setIncludeSheetNames(false);
|
||||
assertEquals("replaceMe\n", extractor.getText());
|
||||
}
|
||||
|
||||
public void testNumericFormula() throws Exception {
|
||||
String path = System.getProperty("HSSF.testdata.path");
|
||||
FileInputStream fin = new FileInputStream(path + File.separator + "sumifformula.xls");
|
||||
|
||||
ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
|
||||
|
||||
assertEquals(
|
||||
"Sheet1\n" +
|
||||
"1000.0\t1.0\t5.0\n" +
|
||||
"2000.0\t2.0\t\n" +
|
||||
"3000.0\t3.0\t\n" +
|
||||
"4000.0\t4.0\t\n" +
|
||||
"5000.0\t5.0\t\n" +
|
||||
"Sheet2\nSheet3\n",
|
||||
extractor.getText()
|
||||
);
|
||||
|
||||
extractor.setFormulasNotResults(true);
|
||||
|
||||
assertEquals(
|
||||
"Sheet1\n" +
|
||||
"1000.0\t1.0\tSUMIF(A1:A5,\">4000\",B1:B5)\n" +
|
||||
"2000.0\t2.0\t\n" +
|
||||
"3000.0\t3.0\t\n" +
|
||||
"4000.0\t4.0\t\n" +
|
||||
"5000.0\t5.0\t\n" +
|
||||
"Sheet2\nSheet3\n",
|
||||
extractor.getText()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public void testStringConcat() throws Exception {
|
||||
String path = System.getProperty("HSSF.testdata.path");
|
||||
FileInputStream fin = new FileInputStream(path + File.separator + "SimpleWithFormula.xls");
|
||||
|
||||
ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
|
||||
|
||||
// Comes out as NaN if treated as a number
|
||||
// And as XYZ if treated as a string
|
||||
assertEquals("Sheet1\nreplaceme\nreplaceme\nreplacemereplaceme\nSheet2\nSheet3\n", extractor.getText());
|
||||
|
||||
extractor.setFormulasNotResults(true);
|
||||
|
||||
assertEquals("Sheet1\nreplaceme\nreplaceme\nCONCATENATE(A1,A2)\nSheet2\nSheet3\n", extractor.getText());
|
||||
}
|
||||
|
||||
public void testStringFormula() throws Exception {
|
||||
String path = System.getProperty("HSSF.testdata.path");
|
||||
FileInputStream fin = new FileInputStream(path + File.separator + "StringFormulas.xls");
|
||||
|
||||
ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(fin));
|
||||
|
||||
// Comes out as NaN if treated as a number
|
||||
// And as XYZ if treated as a string
|
||||
assertEquals("Sheet1\nXYZ\nSheet2\nSheet3\n", extractor.getText());
|
||||
|
||||
extractor.setFormulasNotResults(true);
|
||||
|
||||
assertEquals("Sheet1\nUPPER(\"xyz\")\nSheet2\nSheet3\n", extractor.getText());
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue