mirror of https://github.com/apache/poi.git
Make a start on a text extractor for xlsx files
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@607058 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7d409e8139
commit
5bbf6f70f6
|
@ -16,6 +16,8 @@
|
||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi;
|
package org.apache.poi;
|
||||||
|
|
||||||
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parent class of all UserModel POI XML (ooxml)
|
* Parent class of all UserModel POI XML (ooxml)
|
||||||
* implementations.
|
* implementations.
|
||||||
|
@ -23,5 +25,9 @@ package org.apache.poi;
|
||||||
* for the XML based classes.
|
* for the XML based classes.
|
||||||
*/
|
*/
|
||||||
public abstract class POIXMLDocument {
|
public abstract class POIXMLDocument {
|
||||||
// TODO
|
private HXFDocument document;
|
||||||
|
|
||||||
|
protected POIXMLDocument(HXFDocument document) {
|
||||||
|
this.document = document;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi;
|
||||||
|
|
||||||
|
public abstract class POIXMLTextExtractor extends POITextExtractor {
|
||||||
|
/** The POIXMLDocument that's open */
|
||||||
|
protected POIXMLDocument document;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new text extractor for the given document
|
||||||
|
*/
|
||||||
|
public POIXMLTextExtractor(POIXMLDocument document) {
|
||||||
|
super(null);
|
||||||
|
|
||||||
|
this.document = document;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,113 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hssf.extractor;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
|
import org.apache.poi.hssf.HSSFXML;
|
||||||
|
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
|
||||||
|
import org.apache.xmlbeans.XmlException;
|
||||||
|
import org.openxml4j.exceptions.OpenXML4JException;
|
||||||
|
import org.openxml4j.opc.Package;
|
||||||
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCell;
|
||||||
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTCellFormula;
|
||||||
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRow;
|
||||||
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTSheet;
|
||||||
|
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorksheet;
|
||||||
|
|
||||||
|
public class HXFExcelExtractor extends POIXMLTextExtractor {
|
||||||
|
private HSSFXMLWorkbook workbook;
|
||||||
|
private boolean includeSheetNames = true;
|
||||||
|
private boolean formulasNotResults = false;
|
||||||
|
|
||||||
|
public HXFExcelExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
|
||||||
|
this(new HSSFXMLWorkbook(
|
||||||
|
new HSSFXML(container)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
public HXFExcelExtractor(HSSFXMLWorkbook workbook) {
|
||||||
|
super(workbook);
|
||||||
|
this.workbook = workbook;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should sheet names be included? Default is true
|
||||||
|
*/
|
||||||
|
public void setIncludeSheetNames(boolean includeSheetNames) {
|
||||||
|
this.includeSheetNames = includeSheetNames;
|
||||||
|
}
|
||||||
|
/**
|
||||||
|
* Should we return the formula itself, and not
|
||||||
|
* the result it produces? Default is false
|
||||||
|
*/
|
||||||
|
public void setFormulasNotResults(boolean formulasNotResults) {
|
||||||
|
this.formulasNotResults = formulasNotResults;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the text contents of the file
|
||||||
|
*/
|
||||||
|
public String getText() {
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
|
||||||
|
CTSheet[] sheetRefs =
|
||||||
|
workbook._getHSSFXML().getSheetReferences().getSheetArray();
|
||||||
|
for(int i=0; i<sheetRefs.length; i++) {
|
||||||
|
try {
|
||||||
|
CTWorksheet sheet =
|
||||||
|
workbook._getHSSFXML().getSheet(sheetRefs[i]);
|
||||||
|
CTRow[] rows =
|
||||||
|
sheet.getSheetData().getRowArray();
|
||||||
|
|
||||||
|
if(i > 0) {
|
||||||
|
text.append("\n");
|
||||||
|
}
|
||||||
|
if(includeSheetNames) {
|
||||||
|
text.append(sheetRefs[i].getName() + "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int j=0; j<rows.length; j++) {
|
||||||
|
CTCell[] cells = rows[j].getCArray();
|
||||||
|
for(int k=0; k<cells.length; k++) {
|
||||||
|
CTCell cell = cells[k];
|
||||||
|
if(k > 0) {
|
||||||
|
text.append("\t");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Is it a formula one?
|
||||||
|
if(cell.getF() != null) {
|
||||||
|
if(formulasNotResults) {
|
||||||
|
text.append(cell.getF().getStringValue());
|
||||||
|
} else {
|
||||||
|
text.append(cell.getV());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Probably just want the v value
|
||||||
|
text.append(cell.getV());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
text.append("\n");
|
||||||
|
}
|
||||||
|
} catch(Exception e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hssf.usermodel;
|
||||||
|
|
||||||
|
import org.apache.poi.POIXMLDocument;
|
||||||
|
import org.apache.poi.hssf.HSSFXML;
|
||||||
|
|
||||||
|
public class HSSFXMLWorkbook extends POIXMLDocument {
|
||||||
|
private HSSFXML hssfXML;
|
||||||
|
|
||||||
|
public HSSFXMLWorkbook(HSSFXML xml) {
|
||||||
|
super(xml);
|
||||||
|
this.hssfXML = xml;
|
||||||
|
}
|
||||||
|
|
||||||
|
public HSSFXML _getHSSFXML() {
|
||||||
|
return hssfXML;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,75 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hssf.extractor;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
import org.apache.poi.hssf.HSSFXML;
|
||||||
|
import org.apache.poi.hssf.usermodel.HSSFXMLWorkbook;
|
||||||
|
import org.apache.poi.hxf.HXFDocument;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for HXFExcelExtractor
|
||||||
|
*/
|
||||||
|
public class TestHXFExcelExtractor extends TestCase {
|
||||||
|
/**
|
||||||
|
* A very simple file
|
||||||
|
*/
|
||||||
|
private HSSFXML xmlA;
|
||||||
|
/**
|
||||||
|
* A fairly complex file
|
||||||
|
*/
|
||||||
|
private HSSFXML xmlB;
|
||||||
|
|
||||||
|
protected void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
|
||||||
|
File fileA = new File(
|
||||||
|
System.getProperty("HSSF.testdata.path") +
|
||||||
|
File.separator + "sample.xlsx"
|
||||||
|
);
|
||||||
|
File fileB = new File(
|
||||||
|
System.getProperty("HSSF.testdata.path") +
|
||||||
|
File.separator + "AverageTaxRates.xlsx"
|
||||||
|
);
|
||||||
|
|
||||||
|
xmlA = new HSSFXML(HXFDocument.openPackage(fileA));
|
||||||
|
xmlB = new HSSFXML(HXFDocument.openPackage(fileB));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get text out of the simple file
|
||||||
|
*/
|
||||||
|
public void testGetSimpleText() throws Exception {
|
||||||
|
new HXFExcelExtractor(xmlA.getPackage());
|
||||||
|
new HXFExcelExtractor(new HSSFXMLWorkbook(xmlA));
|
||||||
|
|
||||||
|
HXFExcelExtractor extractor =
|
||||||
|
new HXFExcelExtractor(xmlA.getPackage());
|
||||||
|
extractor.getText();
|
||||||
|
|
||||||
|
String text = extractor.getText();
|
||||||
|
assertTrue(text.length() > 0);
|
||||||
|
System.err.println(text);
|
||||||
|
|
||||||
|
// Check sheet names
|
||||||
|
assertTrue(text.startsWith("Sheet1"));
|
||||||
|
assertTrue(text.endsWith("Sheet3\n"));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue