mirror of https://github.com/apache/poi.git
XSLF: text extraction from tables
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@897875 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
14b87c4232
commit
b1c8c26708
|
@ -16,28 +16,18 @@
|
||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.xslf.extractor;
|
package org.apache.poi.xslf.extractor;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.poi.POIXMLTextExtractor;
|
import org.apache.poi.POIXMLTextExtractor;
|
||||||
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
|
||||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||||
import org.apache.poi.xslf.XSLFSlideShow;
|
import org.apache.poi.xslf.XSLFSlideShow;
|
||||||
|
import org.apache.poi.xslf.usermodel.DrawingParagraph;
|
||||||
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
import org.apache.poi.xslf.usermodel.XMLSlideShow;
|
||||||
|
import org.apache.poi.xslf.usermodel.XSLFCommonSlideData;
|
||||||
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
import org.apache.poi.xslf.usermodel.XSLFSlide;
|
||||||
import org.apache.xmlbeans.XmlException;
|
import org.apache.xmlbeans.XmlException;
|
||||||
import org.apache.xmlbeans.XmlObject;
|
import org.openxmlformats.schemas.presentationml.x2006.main.*;
|
||||||
import org.apache.xmlbeans.XmlCursor;
|
|
||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
|
import java.io.IOException;
|
||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
|
||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
|
|
||||||
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextLineBreak;
|
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
|
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
|
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
|
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
|
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
|
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
|
|
||||||
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
|
|
||||||
|
|
||||||
public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||||
private XMLSlideShow slideshow;
|
private XMLSlideShow slideshow;
|
||||||
|
@ -110,7 +100,7 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||||
slideshow._getXSLFSlideShow().getSlideComments(slideId);
|
slideshow._getXSLFSlideShow().getSlideComments(slideId);
|
||||||
|
|
||||||
if(slideText) {
|
if(slideText) {
|
||||||
extractText(rawSlide.getCSld().getSpTree(), text);
|
extractText(slides[i].getCommonSlideData(), text);
|
||||||
|
|
||||||
// Comments too for the slide
|
// Comments too for the slide
|
||||||
if(comments != null) {
|
if(comments != null) {
|
||||||
|
@ -123,8 +113,9 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(notesText && notes != null) {
|
if(notesText && notes != null) {
|
||||||
extractText(notes.getCSld().getSpTree(), text);
|
extractText(new XSLFCommonSlideData(notes.getCSld()), text);
|
||||||
}
|
}
|
||||||
} catch(Exception e) {
|
} catch(Exception e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
|
@ -134,31 +125,10 @@ public class XSLFPowerPointExtractor extends POIXMLTextExtractor {
|
||||||
return text.toString();
|
return text.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void extractText(CTGroupShape gs, StringBuffer text) {
|
private void extractText(XSLFCommonSlideData data, StringBuffer text) {
|
||||||
CTShape[] shapes = gs.getSpArray();
|
for (DrawingParagraph p : data.getText()) {
|
||||||
for (int i = 0; i < shapes.length; i++) {
|
text.append(p.getText());
|
||||||
CTTextBody textBody =
|
text.append("\n");
|
||||||
shapes[i].getTxBody();
|
}
|
||||||
if(textBody != null) {
|
}
|
||||||
CTTextParagraph[] paras =
|
|
||||||
textBody.getPArray();
|
|
||||||
for (int j = 0; j < paras.length; j++) {
|
|
||||||
XmlCursor c = paras[j].newCursor();
|
|
||||||
c.selectPath("./*");
|
|
||||||
while (c.toNextSelection()) {
|
|
||||||
XmlObject o = c.getObject();
|
|
||||||
if(o instanceof CTRegularTextRun){
|
|
||||||
CTRegularTextRun txrun = (CTRegularTextRun)o;
|
|
||||||
text.append( txrun.getT() );
|
|
||||||
} else if (o instanceof CTTextLineBreak){
|
|
||||||
text.append('\n');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// End each paragraph with a new line
|
|
||||||
text.append("\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextLineBreak;
|
||||||
|
import org.apache.xmlbeans.XmlCursor;
|
||||||
|
import org.apache.xmlbeans.XmlObject;
|
||||||
|
|
||||||
|
public class DrawingParagraph {
|
||||||
|
private final CTTextParagraph p;
|
||||||
|
|
||||||
|
public DrawingParagraph(CTTextParagraph p) {
|
||||||
|
this.p = p;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CharSequence getText() {
|
||||||
|
StringBuilder text = new StringBuilder();
|
||||||
|
|
||||||
|
XmlCursor c = p.newCursor();
|
||||||
|
c.selectPath("./*");
|
||||||
|
while (c.toNextSelection()) {
|
||||||
|
XmlObject o = c.getObject();
|
||||||
|
if (o instanceof CTRegularTextRun) {
|
||||||
|
CTRegularTextRun txrun = (CTRegularTextRun) o;
|
||||||
|
text.append(txrun.getT());
|
||||||
|
} else if (o instanceof CTTextLineBreak) {
|
||||||
|
text.append('\n');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTableRow;
|
||||||
|
|
||||||
|
public class DrawingTable {
|
||||||
|
private final CTTable table;
|
||||||
|
|
||||||
|
public DrawingTable(CTTable table) {
|
||||||
|
this.table = table;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DrawingTableRow[] getRows() {
|
||||||
|
CTTableRow[] ctTableRows = table.getTrArray();
|
||||||
|
DrawingTableRow[] o = new DrawingTableRow[ctTableRows.length];
|
||||||
|
|
||||||
|
for (int i=0; i<o.length; i++) {
|
||||||
|
o[i] = new DrawingTableRow(ctTableRows[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTableCell;
|
||||||
|
|
||||||
|
public class DrawingTableCell {
|
||||||
|
private final CTTableCell cell;
|
||||||
|
private final DrawingTextBody drawingTextBody;
|
||||||
|
|
||||||
|
public DrawingTableCell(CTTableCell cell) {
|
||||||
|
this.cell = cell;
|
||||||
|
drawingTextBody = new DrawingTextBody(this.cell.getTxBody());
|
||||||
|
}
|
||||||
|
|
||||||
|
public DrawingTextBody getTextBody() {
|
||||||
|
return drawingTextBody;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTableRow;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTableCell;
|
||||||
|
|
||||||
|
public class DrawingTableRow {
|
||||||
|
private final CTTableRow row;
|
||||||
|
|
||||||
|
public DrawingTableRow(CTTableRow row) {
|
||||||
|
this.row = row;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DrawingTableCell[] getCells() {
|
||||||
|
CTTableCell[] ctTableCells = row.getTcArray();
|
||||||
|
DrawingTableCell[] o = new DrawingTableCell[ctTableCells.length];
|
||||||
|
|
||||||
|
for (int i=0; i<o.length; i++) {
|
||||||
|
o[i] = new DrawingTableCell(ctTableCells[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
|
||||||
|
|
||||||
|
public class DrawingTextBody {
|
||||||
|
private final CTTextBody textBody;
|
||||||
|
|
||||||
|
public DrawingTextBody(CTTextBody textBody) {
|
||||||
|
this.textBody = textBody;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DrawingParagraph[] getParagraphs() {
|
||||||
|
CTTextParagraph[] pArray = textBody.getPArray();
|
||||||
|
DrawingParagraph[] o = new DrawingParagraph[pArray.length];
|
||||||
|
|
||||||
|
for (int i=0; i<o.length; i++) {
|
||||||
|
o[i] = new DrawingParagraph(pArray[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,67 @@
|
||||||
|
package org.apache.poi.xslf.usermodel;
|
||||||
|
|
||||||
|
import org.apache.xmlbeans.XmlCursor;
|
||||||
|
import org.apache.xmlbeans.XmlObject;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTable;
|
||||||
|
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTCommonSlideData;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
|
||||||
|
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class XSLFCommonSlideData {
|
||||||
|
private final CTCommonSlideData data;
|
||||||
|
|
||||||
|
public XSLFCommonSlideData(CTCommonSlideData data) {
|
||||||
|
this.data = data;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<DrawingParagraph> getText() {
|
||||||
|
CTGroupShape gs = data.getSpTree();
|
||||||
|
|
||||||
|
List<DrawingParagraph> out = new ArrayList<DrawingParagraph>();
|
||||||
|
|
||||||
|
CTShape[] shapes = gs.getSpArray();
|
||||||
|
for (int i = 0; i < shapes.length; i++) {
|
||||||
|
CTTextBody ctTextBody = shapes[i].getTxBody();
|
||||||
|
if (ctTextBody==null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
DrawingTextBody textBody = new DrawingTextBody(ctTextBody);
|
||||||
|
|
||||||
|
out.addAll(Arrays.asList(textBody.getParagraphs()));
|
||||||
|
}
|
||||||
|
|
||||||
|
CTGraphicalObjectFrame[] graphicFrames = gs.getGraphicFrameArray();
|
||||||
|
for (CTGraphicalObjectFrame frame: graphicFrames) {
|
||||||
|
CTGraphicalObjectData data = frame.getGraphic().getGraphicData();
|
||||||
|
XmlCursor c = data.newCursor();
|
||||||
|
c.selectPath("./*");
|
||||||
|
|
||||||
|
while (c.toNextSelection()) {
|
||||||
|
XmlObject o = c.getObject();
|
||||||
|
|
||||||
|
if (o instanceof CTTable) {
|
||||||
|
DrawingTable table = new DrawingTable((CTTable) o);
|
||||||
|
|
||||||
|
for (DrawingTableRow row : table.getRows()) {
|
||||||
|
for (DrawingTableCell cell : row.getCells()) {
|
||||||
|
DrawingTextBody textBody = cell.getTextBody();
|
||||||
|
|
||||||
|
out.addAll(Arrays.asList(textBody.getParagraphs()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -26,11 +26,13 @@ import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
|
||||||
public class XSLFSlide extends XSLFSheet implements Slide {
|
public class XSLFSlide extends XSLFSheet implements Slide {
|
||||||
private CTSlide slide;
|
private CTSlide slide;
|
||||||
private CTSlideIdListEntry slideId;
|
private CTSlideIdListEntry slideId;
|
||||||
|
private XSLFCommonSlideData data;
|
||||||
|
|
||||||
public XSLFSlide(CTSlide slide, CTSlideIdListEntry slideId, SlideShow parent) {
|
public XSLFSlide(CTSlide slide, CTSlideIdListEntry slideId, SlideShow parent) {
|
||||||
super(parent);
|
super(parent);
|
||||||
this.slide = slide;
|
this.slide = slide;
|
||||||
this.slideId = slideId;
|
this.slideId = slideId;
|
||||||
|
this.data = new XSLFCommonSlideData(slide.getCSld());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -88,4 +90,8 @@ public class XSLFSlide extends XSLFSheet implements Slide {
|
||||||
// TODO Auto-generated method stub
|
// TODO Auto-generated method stub
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public XSLFCommonSlideData getCommonSlideData() {
|
||||||
|
return data;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -113,4 +113,17 @@ public class TestXSLFPowerPointExtractor extends TestCase {
|
||||||
// Check comments are there
|
// Check comments are there
|
||||||
assertTrue("Unable to find expected word in text\n" + text, text.contains("testdoc"));
|
assertTrue("Unable to find expected word in text\n" + text, text.contains("testdoc"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testTable() throws Exception {
|
||||||
|
POIDataSamples slTests = POIDataSamples.getSlideShowInstance();
|
||||||
|
xmlA = new XSLFSlideShow(OPCPackage.open(slTests.openResourceAsStream("present1.pptx")));
|
||||||
|
XSLFPowerPointExtractor extractor =
|
||||||
|
new XSLFPowerPointExtractor(xmlA);
|
||||||
|
|
||||||
|
String text = extractor.getText();
|
||||||
|
assertTrue(text.length() > 0);
|
||||||
|
|
||||||
|
// Check comments are there
|
||||||
|
assertTrue("Unable to find expected word in text\n" + text, text.contains("TEST"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue