mirror of https://github.com/apache/poi.git
Bug 60003 - Regression: HSLF Powerpoint text extractor from footer of master slide
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1763927 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b2b0e88cd1
commit
76c87681a6
|
@ -17,21 +17,43 @@
|
||||||
|
|
||||||
package org.apache.poi.hslf.extractor;
|
package org.apache.poi.hslf.extractor;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.File;
|
||||||
import java.util.*;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.poi.POIOLE2TextExtractor;
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
import org.apache.poi.hslf.model.*;
|
import org.apache.poi.hslf.model.Comment;
|
||||||
import org.apache.poi.hslf.usermodel.*;
|
import org.apache.poi.hslf.model.HSLFMetroShape;
|
||||||
import org.apache.poi.poifs.filesystem.*;
|
import org.apache.poi.hslf.model.HeadersFooters;
|
||||||
|
import org.apache.poi.hslf.model.OLEShape;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFNotes;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFShape;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFSlide;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFSlideMaster;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFTable;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFTableCell;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
|
||||||
|
import org.apache.poi.hslf.usermodel.HSLFTextShape;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
|
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.util.POILogFactory;
|
||||||
|
import org.apache.poi.util.POILogger;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class can be used to extract text from a PowerPoint file. Can optionally
|
* This class can be used to extract text from a PowerPoint file. Can optionally
|
||||||
* also get the notes from one.
|
* also get the notes from one.
|
||||||
*
|
|
||||||
* @author Nick Burch
|
|
||||||
*/
|
*/
|
||||||
public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
||||||
|
private static final POILogger LOG = POILogFactory.getLogger(PowerPointExtractor.class);
|
||||||
|
|
||||||
private final HSLFSlideShowImpl _hslfshow;
|
private final HSLFSlideShowImpl _hslfshow;
|
||||||
private final HSLFSlideShow _show;
|
private final HSLFSlideShow _show;
|
||||||
private final List<HSLFSlide> _slides;
|
private final List<HSLFSlide> _slides;
|
||||||
|
@ -207,20 +229,27 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
||||||
for (HSLFSlideMaster master : _show.getSlideMasters()) {
|
for (HSLFSlideMaster master : _show.getSlideMasters()) {
|
||||||
for(HSLFShape sh : master.getShapes()){
|
for(HSLFShape sh : master.getShapes()){
|
||||||
if(sh instanceof HSLFTextShape){
|
if(sh instanceof HSLFTextShape){
|
||||||
if(HSLFMasterSheet.isPlaceholder(sh)) {
|
HSLFTextShape hsh = (HSLFTextShape)sh;
|
||||||
// don't bother about boiler
|
final String text = hsh.getText();
|
||||||
// plate text on master
|
if (text == null || "".equals(text) || "*".equals(text)) {
|
||||||
// sheets
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
HSLFTextShape tsh = (HSLFTextShape)sh;
|
|
||||||
String text = tsh.getText();
|
if (HSLFMasterSheet.isPlaceholder(sh)) {
|
||||||
if (text != null){
|
// check for metro shape of complex placeholder
|
||||||
ret.append(text);
|
boolean isMetro = new HSLFMetroShape<HSLFShape>(sh).hasMetroBlob();
|
||||||
if (!text.endsWith("\n")) {
|
|
||||||
ret.append("\n");
|
if (!isMetro) {
|
||||||
|
// don't bother about boiler plate text on master sheets
|
||||||
|
LOG.log(POILogger.INFO, "Ignoring boiler plate (placeholder) text on slide master:", text);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ret.append(text);
|
||||||
|
if (!text.endsWith("\n")) {
|
||||||
|
ret.append("\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,14 +47,20 @@ public class HSLFMetroShape<T extends Shape<?,?>> {
|
||||||
* @return the bytes of the metro blob, which are bytes of an OPCPackage, i.e. a zip stream
|
* @return the bytes of the metro blob, which are bytes of an OPCPackage, i.e. a zip stream
|
||||||
*/
|
*/
|
||||||
public byte[] getMetroBytes() {
|
public byte[] getMetroBytes() {
|
||||||
|
EscherComplexProperty ep = getMetroProp();
|
||||||
|
return (ep == null) ? null : ep.getComplexData();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return if there's a metro blob to extract
|
||||||
|
*/
|
||||||
|
public boolean hasMetroBlob() {
|
||||||
|
return getMetroProp() != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private EscherComplexProperty getMetroProp() {
|
||||||
AbstractEscherOptRecord opt = shape.getEscherChild(EscherTertiaryOptRecord.RECORD_ID);
|
AbstractEscherOptRecord opt = shape.getEscherChild(EscherTertiaryOptRecord.RECORD_ID);
|
||||||
if (opt != null) {
|
return (opt == null) ? null : (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
|
||||||
EscherComplexProperty ep = (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
|
|
||||||
if (ep != null) {
|
|
||||||
return ep.getComplexData();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -432,4 +432,18 @@ public final class TestExtractor {
|
||||||
"table\t1\t2\t3\t4";
|
"table\t1\t2\t3\t4";
|
||||||
assertTrue(text.contains(target));
|
assertTrue(text.contains(target));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// bug 60003
|
||||||
|
@Test
|
||||||
|
public void testExtractMasterSlideFooterText() throws Exception {
|
||||||
|
HSLFSlideShowImpl hslf = new HSLFSlideShowImpl(slTests.openResourceAsStream("60003.ppt"));
|
||||||
|
ppe.close();
|
||||||
|
|
||||||
|
ppe = new PowerPointExtractor(hslf);
|
||||||
|
ppe.setMasterByDefault(true);
|
||||||
|
|
||||||
|
String text = ppe.getText();
|
||||||
|
assertContains(text, "Prague");
|
||||||
|
hslf.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue