From 4c253baee32b650f78f8cb6af26795537d22fc81 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Fri, 19 Nov 2010 18:14:06 +0000 Subject: [PATCH] Some refactoring of XSSFEventBasedExcelExtractor, to make it easier for you to have control over outputting the cell contents, inspired by bug #50076 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1036968 13f79535-47bb-0310-9956-ffa450edef68 --- src/documentation/content/xdocs/status.xml | 1 + .../eventusermodel/XSSFSheetXMLHandler.java | 253 +++++++++++++++++ .../XSSFEventBasedExcelExtractor.java | 257 +++--------------- 3 files changed, 287 insertions(+), 224 deletions(-) create mode 100644 src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index f401b0308c..1172555e45 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,6 +34,7 @@ + 50076 - Refactor XSSFEventBasedExcelExtractor to make it easier for you to have control over outputting the cell contents 50258 - avoid corruption of XSSFWorkbook after applying XSSFRichTextRun#applyFont 50154 - Allow white spaces and unicode in OPC relationship targets 50113 - Remove cell from Calculation Chain after setting cell type to blank diff --git a/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java new file mode 100644 index 0000000000..873d84eea9 --- /dev/null +++ b/src/ooxml/java/org/apache/poi/xssf/eventusermodel/XSSFSheetXMLHandler.java @@ -0,0 +1,253 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.xssf.eventusermodel; + +import org.apache.poi.ss.usermodel.BuiltinFormats; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.xssf.model.StylesTable; +import org.apache.poi.xssf.usermodel.XSSFCellStyle; +import org.apache.poi.xssf.usermodel.XSSFRichTextString; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * This class handles the processing of a sheet#.xml + * sheet part of a XSSF .xlsx file, and generates + * row and cell events for it. + */ +public class XSSFSheetXMLHandler extends DefaultHandler { + /** + * These are the different kinds of cells we support. + * We keep track of the current one between + * the start and end. + */ + enum xssfDataType { + BOOLEAN, + ERROR, + FORMULA, + INLINE_STRING, + SST_STRING, + NUMBER, + } + + /** + * Table with the styles used for formatting + */ + private StylesTable stylesTable; + + private ReadOnlySharedStringsTable sharedStringsTable; + + /** + * Where our text is going + */ + private final SheetContentsHandler output; + + // Set when V start element is seen + private boolean vIsOpen; + // Set when F start element is seen + private boolean fIsOpen; + + // Set when cell start element is seen; + // used when cell close element is seen. 
+ private xssfDataType nextDataType; + + // Used to format numeric cell values. + private short formatIndex; + private String formatString; + private final DataFormatter formatter; + private String cellRef; + private boolean formulasNotResults; + + // Gathers characters as they are seen. + private StringBuffer value = new StringBuffer(); + private StringBuffer formula = new StringBuffer(); + + /** + * Accepts objects needed while parsing. + * + * @param styles Table of styles + * @param strings Table of shared strings + * @param sheetContentsHandler Receiver for the row and cell events + * @param formulasNotResults Whether formula cells report the formula text rather than the stored value + */ + public XSSFSheetXMLHandler( + StylesTable styles, + ReadOnlySharedStringsTable strings, + SheetContentsHandler sheetContentsHandler, + boolean formulasNotResults) { + this.stylesTable = styles; + this.sharedStringsTable = strings; + this.output = sheetContentsHandler; + this.formulasNotResults = formulasNotResults; + this.nextDataType = xssfDataType.NUMBER; + this.formatter = new DataFormatter(); + } + + public void startElement(String uri, String localName, String name, + Attributes attributes) throws SAXException { + + if ("inlineStr".equals(name) || "v".equals(name)) { + vIsOpen = true; + // Clear contents cache + value.setLength(0); + } else if ("f".equals(name)) { + // Clear contents cache + formula.setLength(0); + + // Mark us as being a formula if not already + if(nextDataType == xssfDataType.NUMBER) { + nextDataType = xssfDataType.FORMULA; + } + + // Decide where to get the formula string from + String type = attributes.getValue("t"); + if(type != null && type.equals("shared")) { + System.err.println("Warning - shared formulas not yet supported!"); + } else { + fIsOpen = true; + } + } + else if("row".equals(name)) { + int rowNum = Integer.parseInt(attributes.getValue("r")) - 1; + output.startRow(rowNum); + } + // c => cell + else if ("c".equals(name)) { + // Set up defaults.
+ this.nextDataType = xssfDataType.NUMBER; + this.formatIndex = -1; + this.formatString = null; + cellRef = attributes.getValue("r"); + String cellType = attributes.getValue("t"); + String cellStyleStr = attributes.getValue("s"); + if ("b".equals(cellType)) + nextDataType = xssfDataType.BOOLEAN; + else if ("e".equals(cellType)) + nextDataType = xssfDataType.ERROR; + else if ("inlineStr".equals(cellType)) + nextDataType = xssfDataType.INLINE_STRING; + else if ("s".equals(cellType)) + nextDataType = xssfDataType.SST_STRING; + else if ("str".equals(cellType)) + nextDataType = xssfDataType.FORMULA; + else if (cellStyleStr != null) { + // Number, but almost certainly with a special style or format + int styleIndex = Integer.parseInt(cellStyleStr); + XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); + this.formatIndex = style.getDataFormat(); + this.formatString = style.getDataFormatString(); + if (this.formatString == null) + this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex); + } + } + } + + public void endElement(String uri, String localName, String name) + throws SAXException { + String thisStr = null; + + // v => contents of a cell + if ("v".equals(name)) { + vIsOpen = false; + + // Process the value contents as required, now we have it all + switch (nextDataType) { + case BOOLEAN: + char first = value.charAt(0); + thisStr = first == '0' ? "FALSE" : "TRUE"; + break; + + case ERROR: + thisStr = "ERROR:" + value.toString(); + break; + + case FORMULA: + if(formulasNotResults) { + thisStr = formula.toString(); + } else { + thisStr = value.toString(); + } + break; + + case INLINE_STRING: + // TODO: have not seen an example of this, so it's untested.
+ XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); + thisStr = rtsi.toString(); + break; + + case SST_STRING: + String sstIndex = value.toString(); + try { + int idx = Integer.parseInt(sstIndex); + XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx)); + thisStr = rtss.toString(); + } + catch (NumberFormatException ex) { + System.err.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString()); + } + break; + + case NUMBER: + String n = value.toString(); + if (this.formatString != null) + thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString); + else + thisStr = n; + break; + + default: + thisStr = "(TODO: Unexpected type: " + nextDataType + ")"; + break; + } + + // Output + output.cell(cellRef, thisStr); + } else if ("f".equals(name)) { + fIsOpen = false; + } else if ("row".equals(name)) { + output.endRow(); + } + } + + /** + * Captures characters only if a suitable element is open. + * Originally was just "v"; extended for inlineStr also. + */ + public void characters(char[] ch, int start, int length) + throws SAXException { + if (vIsOpen) { + value.append(ch, start, length); + } + if (fIsOpen) { + formula.append(ch, start, length); + } + } + + /** + * You need to implement this to handle the results + * of the sheet parsing. 
+ */ + public interface SheetContentsHandler { + /** A row with the (zero based) row number has started */ + public void startRow(int rowNum); + /** The row most recently started has ended */ + public void endRow(); + /** A cell, with the given formatted value, was encountered */ + public void cell(String cellReference, String formattedValue); + } +} diff --git a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java index 9be9711894..c765ef7925 100644 --- a/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java +++ b/src/ooxml/java/org/apache/poi/xssf/extractor/XSSFEventBasedExcelExtractor.java @@ -26,20 +26,16 @@ import javax.xml.parsers.SAXParserFactory; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; -import org.apache.poi.ss.usermodel.BuiltinFormats; -import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; import org.apache.poi.xssf.eventusermodel.XSSFReader; +import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; +import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; import org.apache.poi.xssf.model.StylesTable; -import org.apache.poi.xssf.usermodel.XSSFCellStyle; -import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.apache.xmlbeans.XmlException; -import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; /** * Implementation of a text extractor from OOXML Excel @@ -50,20 +46,6 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { private boolean includeSheetNames = true; private boolean formulasNotResults = false; - /** - * These are the
different kinds of cells we support. - * We keep track of the current one between - * the start and end. - */ - enum xssfDataType { - BOOLEAN, - ERROR, - FORMULA, - INLINE_STRING, - SST_STRING, - NUMBER, - } - public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { this(OPCPackage.open(path)); } @@ -97,212 +79,11 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { this.formulasNotResults = formulasNotResults; } - - /** - * Handler for sheets. Processes each row and cell, - * formatting Cells as best as it can. - */ - class MyXSSFSheetHandler extends DefaultHandler { - /** - * Table with the styles used for formatting - */ - private StylesTable stylesTable; - - private ReadOnlySharedStringsTable sharedStringsTable; - - /** - * Where our text is going - */ - private final StringBuffer output; - - // Set when V start element is seen - private boolean vIsOpen; - // Set when F start element is seen - private boolean fIsOpen; - - // Set when cell start element is seen; - // used when cell close element is seen. - private xssfDataType nextDataType; - - // Used to format numeric cell values. - private short formatIndex; - private String formatString; - private final DataFormatter formatter; - - // Gathers characters as they are seen. - private StringBuffer value = new StringBuffer(); - private StringBuffer formula = new StringBuffer(); - private boolean firstCellOfRow = true; - - /** - * Accepts objects needed while parsing. 
- * - * @param styles Table of styles - * @param strings Table of shared strings - * @param cols Minimum number of columns to show - * @param target Sink for output - */ - public MyXSSFSheetHandler( - StylesTable styles, - ReadOnlySharedStringsTable strings, - StringBuffer output) { - this.stylesTable = styles; - this.sharedStringsTable = strings; - this.output = output; - this.nextDataType = xssfDataType.NUMBER; - this.formatter = new DataFormatter(); - } - - public void startElement(String uri, String localName, String name, - Attributes attributes) throws SAXException { - - if ("inlineStr".equals(name) || "v".equals(name)) { - vIsOpen = true; - // Clear contents cache - value.setLength(0); - } else if ("f".equals(name)) { - // Clear contents cache - formula.setLength(0); - - // Mark us as being a formula if not already - if(nextDataType == xssfDataType.NUMBER) { - nextDataType = xssfDataType.FORMULA; - } - - // Decide where to get the formula string from - String type = attributes.getValue("t"); - if(type != null && type.equals("shared")) { - System.err.println("Warning - shared formulas not yet supported!"); - } else { - fIsOpen = true; - } - } - else if("row".equals(name)) { - firstCellOfRow = true; - } - // c => cell - else if ("c".equals(name)) { - // Set up defaults. 
- this.nextDataType = xssfDataType.NUMBER; - this.formatIndex = -1; - this.formatString = null; - String cellType = attributes.getValue("t"); - String cellStyleStr = attributes.getValue("s"); - if ("b".equals(cellType)) - nextDataType = xssfDataType.BOOLEAN; - else if ("e".equals(cellType)) - nextDataType = xssfDataType.ERROR; - else if ("inlineStr".equals(cellType)) - nextDataType = xssfDataType.INLINE_STRING; - else if ("s".equals(cellType)) - nextDataType = xssfDataType.SST_STRING; - else if ("str".equals(cellType)) - nextDataType = xssfDataType.FORMULA; - else if (cellStyleStr != null) { - // Number, but almost certainly with a special style or format - int styleIndex = Integer.parseInt(cellStyleStr); - XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); - this.formatIndex = style.getDataFormat(); - this.formatString = style.getDataFormatString(); - if (this.formatString == null) - this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex); - } - } - } - - public void endElement(String uri, String localName, String name) - throws SAXException { - String thisStr = null; - - // v => contents of a cell - if ("v".equals(name)) { - vIsOpen = false; - - // Process the value contents as required, now we have it all - switch (nextDataType) { - case BOOLEAN: - char first = value.charAt(0); - thisStr = first == '0' ? "FALSE" : "TRUE"; - break; - - case ERROR: - thisStr = "ERROR:" + value.toString(); - break; - - case FORMULA: - if(formulasNotResults) { - thisStr = formula.toString(); - } else { - thisStr = value.toString(); - } - break; - - case INLINE_STRING: - // TODO: have seen an example of this, so it's untested. 
- XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); - thisStr = rtsi.toString(); - break; - - case SST_STRING: - String sstIndex = value.toString(); - try { - int idx = Integer.parseInt(sstIndex); - XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx)); - thisStr = rtss.toString(); - } - catch (NumberFormatException ex) { - System.err.println("Failed to parse SST index '" + sstIndex + "': " + ex.toString()); - } - break; - - case NUMBER: - String n = value.toString(); - if (this.formatString != null) - thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString); - else - thisStr = n; - break; - - default: - thisStr = "(TODO: Unexpected type: " + nextDataType + ")"; - break; - } - - // Output - if(!firstCellOfRow) { - output.append('\t'); - } - firstCellOfRow = false; - - output.append(thisStr); - } else if ("f".equals(name)) { - fIsOpen = false; - } else if ("row".equals(name)) { - // Finish the line - output.append('\n'); - } - } - - /** - * Captures characters only if a suitable element is open. - * Originally was just "v"; extended for inlineStr also. 
- */ - public void characters(char[] ch, int start, int length) - throws SAXException { - if (vIsOpen) { - value.append(ch, start, length); - } - if (fIsOpen) { - formula.append(ch, start, length); - } - } - } - /** * Processes the given sheet */ public void processSheet( - StringBuffer output, + SheetTextExtractor sheetExtractor, StylesTable styles, ReadOnlySharedStringsTable strings, InputStream sheetInputStream) @@ -313,7 +94,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { try { SAXParser saxParser = saxFactory.newSAXParser(); XMLReader sheetParser = saxParser.getXMLReader(); - ContentHandler handler = new MyXSSFSheetHandler(styles, strings, output); + ContentHandler handler = new XSSFSheetXMLHandler(styles, strings, sheetExtractor, formulasNotResults); sheetParser.setContentHandler(handler); sheetParser.parse(sheetSource); } catch(ParserConfigurationException e) { @@ -332,13 +113,15 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); StringBuffer text = new StringBuffer(); + SheetTextExtractor sheetExtractor = new SheetTextExtractor(text); + while (iter.hasNext()) { InputStream stream = iter.next(); if(includeSheetNames) { text.append(iter.getSheetName()); text.append('\n'); } - processSheet(text, styles, strings, stream); + processSheet(sheetExtractor, styles, strings, stream); stream.close(); } @@ -354,4 +137,30 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor { return null; } } + + protected class SheetTextExtractor implements SheetContentsHandler { + private final StringBuffer output; + private boolean firstCellOfRow = true; + + protected SheetTextExtractor(StringBuffer output) { + this.output = output; + } + + public void startRow(int rowNum) { + firstCellOfRow = true; + } + + public void endRow() { + output.append('\n'); + } + + public void cell(String cellRef, String formattedValue) { + 
if(firstCellOfRow) { + firstCellOfRow = false; + } else { + output.append('\t'); + } + output.append(formattedValue); + } + } }