make xssf streaming code more extensible

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1836795 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
PJ Fanning 2018-07-27 09:19:58 +00:00
parent 3014d51f51
commit 543c2e3ca9
2 changed files with 85 additions and 88 deletions

View File

@ -59,10 +59,10 @@ import org.xml.sax.helpers.DefaultHandler;
/** /**
* This class makes it easy to get at individual parts * This class makes it easy to get at individual parts
* of an OOXML .xlsx file, suitable for low memory sax * of an OOXML .xlsx file, suitable for low memory sax
* parsing or similar. * parsing or similar.
* It makes up the core part of the EventUserModel support * It makes up the core part of the EventUserModel support
* for XSSF. * for XSSF.
*/ */
public class XSSFReader { public class XSSFReader {
@ -90,7 +90,7 @@ public class XSSFReader {
// strict OOXML likely not fully supported, see #57699 // strict OOXML likely not fully supported, see #57699
// this code is similar to POIXMLDocumentPart.getPartFromOPCPackage(), but I could not combine it // this code is similar to POIXMLDocumentPart.getPartFromOPCPackage(), but I could not combine it
// easily due to different return values // easily due to different return values
if(coreDocRelationship == null) { if (coreDocRelationship == null) {
if (this.pkg.getRelationshipsByType( if (this.pkg.getRelationshipsByType(
PackageRelationshipTypes.STRICT_CORE_DOCUMENT).getRelationship(0) != null) { PackageRelationshipTypes.STRICT_CORE_DOCUMENT).getRelationship(0) != null) {
throw new POIXMLException("Strict OOXML isn't currently supported, please see bug #57699"); throw new POIXMLException("Strict OOXML isn't currently supported, please see bug #57699");
@ -106,27 +106,27 @@ public class XSSFReader {
/** /**
* Opens up the Shared Strings Table, parses it, and * Opens up the Shared Strings Table, parses it, and
* returns a handy object for working with * returns a handy object for working with
* shared strings. * shared strings.
*/ */
public SharedStringsTable getSharedStringsTable() throws IOException, InvalidFormatException { public SharedStringsTable getSharedStringsTable() throws IOException, InvalidFormatException {
ArrayList<PackagePart> parts = pkg.getPartsByContentType( XSSFRelation.SHARED_STRINGS.getContentType()); ArrayList<PackagePart> parts = pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
return parts.size() == 0 ? null : new SharedStringsTable(parts.get(0)); return parts.size() == 0 ? null : new SharedStringsTable(parts.get(0));
} }
/** /**
* Opens up the Styles Table, parses it, and * Opens up the Styles Table, parses it, and
* returns a handy object for working with cell styles * returns a handy object for working with cell styles
*/ */
public StylesTable getStylesTable() throws IOException, InvalidFormatException { public StylesTable getStylesTable() throws IOException, InvalidFormatException {
ArrayList<PackagePart> parts = pkg.getPartsByContentType( XSSFRelation.STYLES.getContentType()); ArrayList<PackagePart> parts = pkg.getPartsByContentType(XSSFRelation.STYLES.getContentType());
if(parts.size() == 0) return null; if (parts.size() == 0) return null;
// Create the Styles Table, and associate the Themes if present // Create the Styles Table, and associate the Themes if present
StylesTable styles = new StylesTable(parts.get(0)); StylesTable styles = new StylesTable(parts.get(0));
parts = pkg.getPartsByContentType( XSSFRelation.THEME.getContentType()); parts = pkg.getPartsByContentType(XSSFRelation.THEME.getContentType());
if(parts.size() != 0) { if (parts.size() != 0) {
styles.setTheme(new ThemesTable(parts.get(0))); styles.setTheme(new ThemesTable(parts.get(0)));
} }
return styles; return styles;
} }
@ -134,7 +134,7 @@ public class XSSFReader {
/** /**
* Returns an InputStream to read the contents of the * Returns an InputStream to read the contents of the
* shared strings table. * shared strings table.
*/ */
public InputStream getSharedStringsData() throws IOException, InvalidFormatException { public InputStream getSharedStringsData() throws IOException, InvalidFormatException {
return XSSFRelation.SHARED_STRINGS.getContents(workbookPart); return XSSFRelation.SHARED_STRINGS.getContents(workbookPart);
@ -142,7 +142,7 @@ public class XSSFReader {
/** /**
* Returns an InputStream to read the contents of the * Returns an InputStream to read the contents of the
* styles table. * styles table.
*/ */
public InputStream getStylesData() throws IOException, InvalidFormatException { public InputStream getStylesData() throws IOException, InvalidFormatException {
return XSSFRelation.STYLES.getContents(workbookPart); return XSSFRelation.STYLES.getContents(workbookPart);
@ -150,7 +150,7 @@ public class XSSFReader {
/** /**
* Returns an InputStream to read the contents of the * Returns an InputStream to read the contents of the
* themes table. * themes table.
*/ */
public InputStream getThemesData() throws IOException, InvalidFormatException { public InputStream getThemesData() throws IOException, InvalidFormatException {
return XSSFRelation.THEME.getContents(workbookPart); return XSSFRelation.THEME.getContents(workbookPart);
@ -158,8 +158,8 @@ public class XSSFReader {
/** /**
* Returns an InputStream to read the contents of the * Returns an InputStream to read the contents of the
* main Workbook, which contains key overall data for * main Workbook, which contains key overall data for
* the file, including sheet definitions. * the file, including sheet definitions.
*/ */
public InputStream getWorkbookData() throws IOException, InvalidFormatException { public InputStream getWorkbookData() throws IOException, InvalidFormatException {
return workbookPart.getInputStream(); return workbookPart.getInputStream();
@ -167,18 +167,19 @@ public class XSSFReader {
/** /**
* Returns an InputStream to read the contents of the * Returns an InputStream to read the contents of the
* specified Sheet. * specified Sheet.
*
* @param relId The relationId of the sheet, from an r:id on the workbook * @param relId The relationId of the sheet, from an r:id on the workbook
*/ */
public InputStream getSheet(String relId) throws IOException, InvalidFormatException { public InputStream getSheet(String relId) throws IOException, InvalidFormatException {
PackageRelationship rel = workbookPart.getRelationship(relId); PackageRelationship rel = workbookPart.getRelationship(relId);
if(rel == null) { if (rel == null) {
throw new IllegalArgumentException("No Sheet found with r:id " + relId); throw new IllegalArgumentException("No Sheet found with r:id " + relId);
} }
PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
PackagePart sheet = pkg.getPart(relName); PackagePart sheet = pkg.getPart(relName);
if(sheet == null) { if (sheet == null) {
throw new IllegalArgumentException("No data found for Sheet with r:id " + relId); throw new IllegalArgumentException("No data found for Sheet with r:id " + relId);
} }
return sheet.getInputStream(); return sheet.getInputStream();
@ -186,10 +187,10 @@ public class XSSFReader {
/** /**
* Returns an Iterator which will let you get at all the * Returns an Iterator which will let you get at all the
* different Sheets in turn. * different Sheets in turn.
* Each sheet's InputStream is only opened when fetched * Each sheet's InputStream is only opened when fetched
* from the Iterator. It's up to you to close the * from the Iterator. It's up to you to close the
* InputStreams when done with each one. * InputStreams when done with each one.
*/ */
public Iterator<InputStream> getSheetsData() throws IOException, InvalidFormatException { public Iterator<InputStream> getSheetsData() throws IOException, InvalidFormatException {
return new SheetIterator(workbookPart); return new SheetIterator(workbookPart);
@ -201,7 +202,7 @@ public class XSSFReader {
public static class SheetIterator implements Iterator<InputStream> { public static class SheetIterator implements Iterator<InputStream> {
/** /**
* Maps relId and the corresponding PackagePart * Maps relId and the corresponding PackagePart
*/ */
private final Map<String, PackagePart> sheetMap; private final Map<String, PackagePart> sheetMap;
@ -232,7 +233,7 @@ public class XSSFReader {
sheetMap = new HashMap<>(); sheetMap = new HashMap<>();
OPCPackage pkg = wb.getPackage(); OPCPackage pkg = wb.getPackage();
Set<String> worksheetRels = getSheetRelationships(); Set<String> worksheetRels = getSheetRelationships();
for(PackageRelationship rel : wb.getRelationships()){ for (PackageRelationship rel : wb.getRelationships()) {
String relType = rel.getRelationshipType(); String relType = rel.getRelationshipType();
if (worksheetRels.contains(relType)) { if (worksheetRels.contains(relType)) {
PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
@ -242,7 +243,7 @@ public class XSSFReader {
//step 2. Read array of CTSheet elements, wrap it in a LinkedList //step 2. Read array of CTSheet elements, wrap it in a LinkedList
//and construct an iterator //and construct an iterator
sheetIterator = createSheetIteratorFromWB(wb); sheetIterator = createSheetIteratorFromWB(wb);
} catch (InvalidFormatException e){ } catch (InvalidFormatException e) {
throw new POIXMLException(e); throw new POIXMLException(e);
} }
} }
@ -311,7 +312,7 @@ public class XSSFReader {
try { try {
PackagePart sheetPkg = sheetMap.get(sheetId); PackagePart sheetPkg = sheetMap.get(sheetId);
return sheetPkg.getInputStream(); return sheetPkg.getInputStream();
} catch(IOException e) { } catch (IOException e) {
throw new POIXMLException(e); throw new POIXMLException(e);
} }
} }
@ -324,67 +325,63 @@ public class XSSFReader {
public String getSheetName() { public String getSheetName() {
return xssfSheetRef.getName(); return xssfSheetRef.getName();
} }
/** /**
* Returns the comments associated with this sheet, * Returns the comments associated with this sheet,
* or null if there aren't any * or null if there aren't any
*/ */
public CommentsTable getSheetComments() { public CommentsTable getSheetComments() {
PackagePart sheetPkg = getSheetPart(); PackagePart sheetPkg = getSheetPart();
// Do we have a comments relationship? (Only ever one if so) // Do we have a comments relationship? (Only ever one if so)
try { try {
PackageRelationshipCollection commentsList = PackageRelationshipCollection commentsList =
sheetPkg.getRelationshipsByType(XSSFRelation.SHEET_COMMENTS.getRelation()); sheetPkg.getRelationshipsByType(XSSFRelation.SHEET_COMMENTS.getRelation());
if(commentsList.size() > 0) { if (commentsList.size() > 0) {
PackageRelationship comments = commentsList.getRelationship(0); PackageRelationship comments = commentsList.getRelationship(0);
PackagePartName commentsName = PackagingURIHelper.createPartName(comments.getTargetURI()); PackagePartName commentsName = PackagingURIHelper.createPartName(comments.getTargetURI());
PackagePart commentsPart = sheetPkg.getPackage().getPart(commentsName); PackagePart commentsPart = sheetPkg.getPackage().getPart(commentsName);
return new CommentsTable(commentsPart); return new CommentsTable(commentsPart);
} }
} catch (InvalidFormatException e) { } catch (InvalidFormatException|IOException e) {
return null; LOGGER.log(POILogger.WARN, e);
} catch (IOException e) { return null;
return null; }
} return null;
return null;
} }
/** /**
* Returns the shapes associated with this sheet: * Returns the shapes associated with this sheet:
* an empty list if there are none, or null if an exception occurs * an empty list if there are none, or null if an exception occurs
*/ */
public List<XSSFShape> getShapes() { public List<XSSFShape> getShapes() {
PackagePart sheetPkg = getSheetPart(); PackagePart sheetPkg = getSheetPart();
List<XSSFShape> shapes= new LinkedList<>(); List<XSSFShape> shapes = new LinkedList<>();
// Do we have any drawings relationships? (There can be several) // Do we have any drawings relationships? (There can be several)
try { try {
PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation()); PackageRelationshipCollection drawingsList = sheetPkg.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation());
for (int i = 0; i < drawingsList.size(); i++){ for (int i = 0; i < drawingsList.size(); i++) {
PackageRelationship drawings = drawingsList.getRelationship(i); PackageRelationship drawings = drawingsList.getRelationship(i);
PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI()); PackagePartName drawingsName = PackagingURIHelper.createPartName(drawings.getTargetURI());
PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName); PackagePart drawingsPart = sheetPkg.getPackage().getPart(drawingsName);
if (drawingsPart == null) { if (drawingsPart == null) {
//parts can go missing; Excel ignores them silently -- TIKA-2134 //parts can go missing; Excel ignores them silently -- TIKA-2134
LOGGER.log(POILogger.WARN, "Missing drawing: "+drawingsName +". Skipping it."); LOGGER.log(POILogger.WARN, "Missing drawing: " + drawingsName + ". Skipping it.");
continue; continue;
} }
XSSFDrawing drawing = new XSSFDrawing(drawingsPart); XSSFDrawing drawing = new XSSFDrawing(drawingsPart);
shapes.addAll(drawing.getShapes()); shapes.addAll(drawing.getShapes());
} }
} catch (XmlException e){ } catch (XmlException|InvalidFormatException|IOException e) {
return null; LOGGER.log(POILogger.WARN, e);
} catch (InvalidFormatException e) { return null;
return null; }
} catch (IOException e) { return shapes;
return null;
}
return shapes;
} }
public PackagePart getSheetPart() { public PackagePart getSheetPart() {
String sheetId = xssfSheetRef.getId(); String sheetId = xssfSheetRef.getId();
return sheetMap.get(sheetId); return sheetMap.get(sheetId);
} }
/** /**

View File

@ -58,16 +58,16 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
private static final POILogger LOGGER = POILogFactory.getLogger(XSSFEventBasedExcelExtractor.class); private static final POILogger LOGGER = POILogFactory.getLogger(XSSFEventBasedExcelExtractor.class);
private OPCPackage container; protected OPCPackage container;
private POIXMLProperties properties; protected POIXMLProperties properties;
private Locale locale; protected Locale locale;
private boolean includeTextBoxes = true; protected boolean includeTextBoxes = true;
private boolean includeSheetNames = true; protected boolean includeSheetNames = true;
private boolean includeCellComments; protected boolean includeCellComments;
private boolean includeHeadersFooters = true; protected boolean includeHeadersFooters = true;
private boolean formulasNotResults; protected boolean formulasNotResults;
private boolean concatenatePhoneticRuns = true; protected boolean concatenatePhoneticRuns = true;
public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException { public XSSFEventBasedExcelExtractor(String path) throws XmlException, OpenXML4JException, IOException {
this(OPCPackage.open(path)); this(OPCPackage.open(path));
@ -254,7 +254,7 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
} }
} }
protected SharedStrings createSharedStringsTable(OPCPackage container, boolean concatenatePhoneticRuns) protected SharedStrings createSharedStringsTable(XSSFReader xssfReader, OPCPackage container)
throws IOException, SAXException { throws IOException, SAXException {
return new ReadOnlySharedStringsTable(container, concatenatePhoneticRuns); return new ReadOnlySharedStringsTable(container, concatenatePhoneticRuns);
} }
@ -264,8 +264,8 @@ public class XSSFEventBasedExcelExtractor extends POIXMLTextExtractor
*/ */
public String getText() { public String getText() {
try { try {
SharedStrings strings = createSharedStringsTable(container, concatenatePhoneticRuns);
XSSFReader xssfReader = new XSSFReader(container); XSSFReader xssfReader = new XSSFReader(container);
SharedStrings strings = createSharedStringsTable(xssfReader, container);
StylesTable styles = xssfReader.getStylesTable(); StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
StringBuilder text = new StringBuilder(64); StringBuilder text = new StringBuilder(64);