Patch from Shaun Kalley from bug #56023 - Allow XSSF event model to find + return comments, and use this for the event based .xlsx text extractor

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1613266 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2014-07-24 20:13:54 +00:00
parent f3dba52888
commit 62bd48af74
4 changed files with 252 additions and 23 deletions

View File

@ -16,13 +16,22 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.xssf.eventusermodel; package org.apache.poi.xssf.eventusermodel;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger; import org.apache.poi.util.POILogger;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTComment;
import org.xml.sax.Attributes; import org.xml.sax.Attributes;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.DefaultHandler;
@ -54,6 +63,15 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
*/ */
private StylesTable stylesTable; private StylesTable stylesTable;
/**
* Table with cell comments
*/
private CommentsTable commentsTable;
/**
* Read-only access to the shared strings table, used for looking
* up the contents of (most) string cells
*/
private ReadOnlySharedStringsTable sharedStringsTable; private ReadOnlySharedStringsTable sharedStringsTable;
/** /**
@ -78,6 +96,7 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
private short formatIndex; private short formatIndex;
private String formatString; private String formatString;
private final DataFormatter formatter; private final DataFormatter formatter;
private int rowNum;
private String cellRef; private String cellRef;
private boolean formulasNotResults; private boolean formulasNotResults;
@ -86,6 +105,31 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
private StringBuffer formula = new StringBuffer(); private StringBuffer formula = new StringBuffer();
private StringBuffer headerFooter = new StringBuffer(); private StringBuffer headerFooter = new StringBuffer();
private Queue<CellReference> commentCellRefs;
/**
 * Accepts objects needed while parsing, including the (optional)
 * table of cell comments.
 *
 * @param styles Table of styles
 * @param comments Table of cell comments; may be null, in which case
 *        no comment cell references are queued up
 * @param strings Table of shared strings
 * @param sheetContentsHandler Handler which receives the parsed sheet contents
 * @param dataFormatter Formatter kept for use while parsing
 * @param formulasNotResults Flag kept for use while parsing (presumably
 *        selects formulas over their cached results - confirm against usage)
 */
public XSSFSheetXMLHandler(
        StylesTable styles,
        CommentsTable comments,
        ReadOnlySharedStringsTable strings,
        SheetContentsHandler sheetContentsHandler,
        DataFormatter dataFormatter,
        boolean formulasNotResults) {
    // Lookup tables
    this.stylesTable = styles;
    this.sharedStringsTable = strings;
    this.commentsTable = comments;

    // Output and formatting configuration
    this.output = sheetContentsHandler;
    this.formatter = dataFormatter;
    this.formulasNotResults = formulasNotResults;

    // Initial parse state
    this.nextDataType = xssfDataType.NUMBER;

    // Must run after commentsTable has been assigned, as it reads it
    init();
}
/** /**
* Accepts objects needed while parsing. * Accepts objects needed while parsing.
* *
@ -98,13 +142,9 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
SheetContentsHandler sheetContentsHandler, SheetContentsHandler sheetContentsHandler,
DataFormatter dataFormatter, DataFormatter dataFormatter,
boolean formulasNotResults) { boolean formulasNotResults) {
this.stylesTable = styles; this(styles, null, strings, sheetContentsHandler, dataFormatter, formulasNotResults);
this.sharedStringsTable = strings;
this.output = sheetContentsHandler;
this.formulasNotResults = formulasNotResults;
this.nextDataType = xssfDataType.NUMBER;
this.formatter = dataFormatter;
} }
/** /**
* Accepts objects needed while parsing. * Accepts objects needed while parsing.
* *
@ -119,6 +159,16 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
this(styles, strings, sheetContentsHandler, new DataFormatter(), formulasNotResults); this(styles, strings, sheetContentsHandler, new DataFormatter(), formulasNotResults);
} }
/**
 * Builds the queue of cell references which have comments, in the
 * order the comments table lists them. Leaves the queue unset when
 * there is no comments table.
 */
private void init() {
    if (commentsTable == null) {
        return;
    }

    commentCellRefs = new LinkedList<CellReference>();
    for (CTComment ctComment : commentsTable.getCTComments().getCommentList().getCommentList()) {
        commentCellRefs.add(new CellReference(ctComment.getRef()));
    }
}
private boolean isTextTag(String name) { private boolean isTextTag(String name) {
if("v".equals(name)) { if("v".equals(name)) {
// Easy, normal v text tag // Easy, normal v text tag
@ -190,7 +240,7 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
headerFooter.setLength(0); headerFooter.setLength(0);
} }
else if("row".equals(name)) { else if("row".equals(name)) {
int rowNum = Integer.parseInt(attributes.getValue("r")) - 1; rowNum = Integer.parseInt(attributes.getValue("r")) - 1;
output.startRow(rowNum); output.startRow(rowNum);
} }
// c => cell // c => cell
@ -304,14 +354,25 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
break; break;
} }
// Do we have a comment for this cell?
checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL);
XSSFComment comment = commentsTable != null ? commentsTable.findCellComment(cellRef) : null;
// Output // Output
output.cell(cellRef, thisStr); output.cell(cellRef, thisStr, comment);
} else if ("f".equals(name)) { } else if ("f".equals(name)) {
fIsOpen = false; fIsOpen = false;
} else if ("is".equals(name)) { } else if ("is".equals(name)) {
isIsOpen = false; isIsOpen = false;
} else if ("row".equals(name)) { } else if ("row".equals(name)) {
output.endRow(); // Handle any "missing" cells which had comments attached
checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_ROW);
// Finish up the row
output.endRow(rowNum);
} else if ("sheetData".equals(name)) {
// Handle any "missing" cells which had comments attached
checkForEmptyCellComments(EmptyCellCommentsCheckType.END_OF_SHEET_DATA);
} }
else if("oddHeader".equals(name) || "evenHeader".equals(name) || else if("oddHeader".equals(name) || "evenHeader".equals(name) ||
"firstHeader".equals(name)) { "firstHeader".equals(name)) {
@ -343,6 +404,90 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
} }
} }
/**
* Do a check for, and output, comments in otherwise empty cells.
* <p>
* Works through the head of {@code commentCellRefs}, reporting pending
* comment references via {@code outputEmptyCellComment}. Does nothing
* when {@code commentCellRefs} is null or empty.
*
* @param type the point in the parse at which the check is being made
* @throws IllegalStateException if no cell reference has been recorded
*         ({@code this.cellRef} is null) and the check is neither an
*         end-of-row nor an end-of-sheet-data check
*/
private void checkForEmptyCellComments(EmptyCellCommentsCheckType type) {
if (commentCellRefs != null && !commentCellRefs.isEmpty()) {
// If we've reached the end of the sheet data, output any
// comments we haven't yet already handled
if (type == EmptyCellCommentsCheckType.END_OF_SHEET_DATA) {
while (!commentCellRefs.isEmpty()) {
outputEmptyCellComment(commentCellRefs.remove());
}
return;
}
// No cell reference has been recorded: at the end of a row, output the
// queued comments at the head of the queue that belong to the current
// row, stopping at the first one which is on a different row
if (this.cellRef == null) {
if (type == EmptyCellCommentsCheckType.END_OF_ROW) {
while (!commentCellRefs.isEmpty()) {
if (commentCellRefs.peek().getRow() == rowNum) {
outputEmptyCellComment(commentCellRefs.remove());
} else {
return;
}
}
return;
} else {
throw new IllegalStateException("Cell ref should be null only if there are only empty cells in the row; rowNum: " + rowNum);
}
}
// Set to the comment reference output on each pass, or to null
// when nothing was output, which ends the loop
CellReference nextCommentCellRef;
do {
// The queue is non-empty here: checked on entry, and again by the loop condition
CellReference cellRef = new CellReference(this.cellRef);
CellReference peekCellRef = commentCellRefs.peek();
if (type == EmptyCellCommentsCheckType.CELL && cellRef.equals(peekCellRef)) {
// remove the comment cell ref from the list if we're about to handle it alongside the cell content
commentCellRefs.remove();
return;
} else {
// fill in any gaps if there are empty cells with comments mixed in with non-empty cells
int comparison = cellRefComparator.compare(peekCellRef, cellRef);
if (comparison > 0 && type == EmptyCellCommentsCheckType.END_OF_ROW && peekCellRef.getRow() <= rowNum) {
// End of row: the queued comment sorts after the last recorded cell,
// and is not on a later row than the current one
nextCommentCellRef = commentCellRefs.remove();
outputEmptyCellComment(nextCommentCellRef);
} else if (comparison < 0 && type == EmptyCellCommentsCheckType.CELL && peekCellRef.getRow() <= rowNum) {
// Cell: the queued comment sorts before the current cell
nextCommentCellRef = commentCellRefs.remove();
outputEmptyCellComment(nextCommentCellRef);
} else {
// Nothing to output at this point, so stop looking
nextCommentCellRef = null;
}
}
} while (nextCommentCellRef != null && !commentCellRefs.isEmpty());
}
}
/**
 * Looks up the comment for the given cell reference in the comments
 * table, and passes it to the contents handler as an empty-cell comment.
 *
 * @param cellRef reference of the cell whose comment is to be output
 */
private void outputEmptyCellComment(CellReference cellRef) {
    final String ref = cellRef.formatAsString();
    output.emptyCellComment(ref, commentsTable.findCellComment(ref));
}
/**
* The point in the parse at which a check for comments on
* otherwise empty cells is being made.
*/
private enum EmptyCellCommentsCheckType {
/** Check made just before a cell's contents are output */
CELL,
/** Check made when the end of a "row" element is reached */
END_OF_ROW,
/** Check made when the end of the "sheetData" element is reached */
END_OF_SHEET_DATA
}
/**
 * Orders cell references by row first, and then by column
 * for references which are on the same row.
 */
private static final Comparator<CellReference> cellRefComparator = new Comparator<CellReference>() {
    @Override
    public int compare(CellReference o1, CellReference o2) {
        int byRow = compare(o1.getRow(), o2.getRow());
        if (byRow != 0) {
            return byRow;
        }
        // Same row, so the column decides
        return compare(o1.getCol(), o2.getCol());
    }

    /** Three-way comparison of two ints, giving -1, 0 or 1 */
    public int compare(int x, int y) {
        if (x < y) {
            return -1;
        }
        if (x == y) {
            return 0;
        }
        return 1;
    }
};
/** /**
* You need to implement this to handle the results * You need to implement this to handle the results
* of the sheet parsing. * of the sheet parsing.
@ -351,9 +496,11 @@ public class XSSFSheetXMLHandler extends DefaultHandler {
/** A row with the (zero based) row number has started */ /** A row with the (zero based) row number has started */
public void startRow(int rowNum); public void startRow(int rowNum);
/** A row with the (zero based) row number has ended */ /** A row with the (zero based) row number has ended */
public void endRow(); public void endRow(int rowNum);
/** A cell, with the given formatted value, was encountered */ /** A cell, with the given formatted value, and possibly a comment, was encountered */
public void cell(String cellReference, String formattedValue); public void cell(String cellReference, String formattedValue, XSSFComment comment);
/** A comment for an otherwise-empty cell was encountered */
public void emptyCellComment(String cellReference, XSSFComment comment);
/** A header or footer has been encountered */ /** A header or footer has been encountered */
public void headerFooter(String text, boolean isHeader, String tagName); public void headerFooter(String text, boolean isHeader, String tagName);
} }

View File

@ -39,7 +39,9 @@ import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFShape; import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape; import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
@ -60,6 +62,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
private Locale locale; private Locale locale;
private boolean includeTextBoxes = true; private boolean includeTextBoxes = true;
private boolean includeSheetNames = true; private boolean includeSheetNames = true;
private boolean includeCellComments = false;
private boolean includeHeadersFooters = true; private boolean includeHeadersFooters = true;
private boolean formulasNotResults = false; private boolean formulasNotResults = false;
@ -112,11 +115,10 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
} }
/** /**
* Would control the inclusion of cell comments from the document, * Should cell comments be included? Default is false
* if we supported it
*/ */
public void setIncludeCellComments(boolean includeCellComments) { public void setIncludeCellComments(boolean includeCellComments) {
throw new IllegalStateException("Comment extraction not supported in streaming mode, please use XSSFExcelExtractor"); this.includeCellComments = includeCellComments;
} }
public void setLocale(Locale locale) { public void setLocale(Locale locale) {
@ -159,6 +161,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
public void processSheet( public void processSheet(
SheetContentsHandler sheetContentsExtractor, SheetContentsHandler sheetContentsExtractor,
StylesTable styles, StylesTable styles,
CommentsTable comments,
ReadOnlySharedStringsTable strings, ReadOnlySharedStringsTable strings,
InputStream sheetInputStream) InputStream sheetInputStream)
throws IOException, SAXException { throws IOException, SAXException {
@ -176,7 +179,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
SAXParser saxParser = saxFactory.newSAXParser(); SAXParser saxParser = saxFactory.newSAXParser();
XMLReader sheetParser = saxParser.getXMLReader(); XMLReader sheetParser = saxParser.getXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler( ContentHandler handler = new XSSFSheetXMLHandler(
styles, strings, sheetContentsExtractor, formatter, formulasNotResults); styles, comments, strings, sheetContentsExtractor, formatter, formulasNotResults);
sheetParser.setContentHandler(handler); sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource); sheetParser.parse(sheetSource);
} catch(ParserConfigurationException e) { } catch(ParserConfigurationException e) {
@ -203,7 +206,8 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
text.append(iter.getSheetName()); text.append(iter.getSheetName());
text.append('\n'); text.append('\n');
} }
processSheet(sheetExtractor, styles, strings, stream); CommentsTable comments = includeCellComments ? iter.getSheetComments() : null;
processSheet(sheetExtractor, styles, comments, strings, stream);
if (includeHeadersFooters) { if (includeHeadersFooters) {
sheetExtractor.appendHeaderText(text); sheetExtractor.appendHeaderText(text);
} }
@ -268,17 +272,32 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
firstCellOfRow = true; firstCellOfRow = true;
} }
public void endRow() { public void endRow(int rowNum) {
output.append('\n'); output.append('\n');
} }
public void cell(String cellRef, String formattedValue) { public void cell(String cellRef, String formattedValue, XSSFComment comment) {
if(firstCellOfRow) { if(firstCellOfRow) {
firstCellOfRow = false; firstCellOfRow = false;
} else { } else {
output.append('\t'); output.append('\t');
} }
output.append(formattedValue); if (formattedValue != null) {
output.append(formattedValue);
}
if (includeCellComments && comment != null) {
String commentText = comment.getString().getString().replace('\n', ' ');
output.append(formattedValue != null ? " Comment by " : "Comment by ");
if (commentText.startsWith(comment.getAuthor() + ": ")) {
output.append(commentText);
} else {
output.append(comment.getAuthor()).append(": ").append(commentText);
}
}
}
public void emptyCellComment(String cellRef, XSSFComment comment) {
cell(cellRef, null, comment);
} }
public void headerFooter(String text, boolean isHeader, String tagName) { public void headerFooter(String text, boolean isHeader, String tagName) {
@ -287,7 +306,6 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
} }
} }
/** /**
* Append the text for the named header or footer if found. * Append the text for the named header or footer if found.
*/ */

View File

@ -20,13 +20,13 @@ package org.apache.poi.xssf.extractor;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import junit.framework.TestCase;
import org.apache.poi.POITextExtractor; import org.apache.poi.POITextExtractor;
import org.apache.poi.hssf.HSSFTestDataSamples; import org.apache.poi.hssf.HSSFTestDataSamples;
import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.xssf.XSSFTestDataSamples; import org.apache.poi.xssf.XSSFTestDataSamples;
import junit.framework.TestCase;
/** /**
* Tests for {@link XSSFEventBasedExcelExtractor} * Tests for {@link XSSFEventBasedExcelExtractor}
*/ */
@ -240,4 +240,68 @@ public class TestXSSFEventBasedExcelExtractor extends TestCase {
fixture.setIncludeHeadersFooters(false); fixture.setIncludeHeadersFooters(false);
assertEquals(expectedOutputWithoutHeadersAndFooters, fixture.getText()); assertEquals(expectedOutputWithoutHeadersAndFooters, fixture.getText());
} }
/**
* Test that XSSFEventBasedExcelExtractor outputs comments when asked to.
* Its expected output differs from that of XSSFExcelExtractor in two
* ways: (1) comments from otherwise empty cells are included, and
* (2) the author is not output twice.
* <p>
* This test will need to be modified if these improvements are ported to
* XSSFExcelExtractor.
*/
public void testCommentsComparedToNonEventBasedExtractor()
throws Exception {
// Expected from both extractors while comments are switched off
String expectedOutputWithoutComments =
"Sheet1\n" +
"\n" +
"abc\n" +
"\n" +
"123\n" +
"\n" +
"\n" +
"\n";
// Expected from XSSFExcelExtractor with comments on: only cells with
// contents carry a comment, and the author appears twice
String nonEventBasedExtractorOutputWithComments =
"Sheet1\n" +
"\n" +
"abc Comment by Shaun Kalley: Shaun Kalley: Comment A2\n" +
"\n" +
"123 Comment by Shaun Kalley: Shaun Kalley: Comment B4\n" +
"\n" +
"\n" +
"\n";
// Expected from XSSFEventBasedExcelExtractor with comments on: comments
// on otherwise empty cells are included, and the author appears once
String eventBasedExtractorOutputWithComments =
"Sheet1\n" +
"Comment by Shaun Kalley: Comment A1\tComment by Shaun Kalley: Comment B1\n" +
"abc Comment by Shaun Kalley: Comment A2\tComment by Shaun Kalley: Comment B2\n" +
"Comment by Shaun Kalley: Comment A3\tComment by Shaun Kalley: Comment B3\n" +
"Comment by Shaun Kalley: Comment A4\t123 Comment by Shaun Kalley: Comment B4\n" +
"Comment by Shaun Kalley: Comment A5\tComment by Shaun Kalley: Comment B5\n" +
"Comment by Shaun Kalley: Comment A7\tComment by Shaun Kalley: Comment B7\n" +
"Comment by Shaun Kalley: Comment A8\tComment by Shaun Kalley: Comment B8\n";
// Non event based extractor: check without comments first, then with
XSSFExcelExtractor extractor = new XSSFExcelExtractor(
XSSFTestDataSamples.openSampleWorkbook("commentTest.xlsx"));
try {
assertEquals(expectedOutputWithoutComments, extractor.getText());
extractor.setIncludeCellComments(true);
assertEquals(nonEventBasedExtractorOutputWithComments, extractor.getText());
} finally {
extractor.close();
}
// Event based extractor: same file, same two checks
XSSFEventBasedExcelExtractor fixture =
new XSSFEventBasedExcelExtractor(
XSSFTestDataSamples.openSamplePackage("commentTest.xlsx"));
try {
assertEquals(expectedOutputWithoutComments, fixture.getText());
fixture.setIncludeCellComments(true);
assertEquals(eventBasedExtractorOutputWithComments, fixture.getText());
} finally {
fixture.close();
}
}
} }

Binary file not shown.