Bug 60003 - Regression: HSLF PowerPoint text extractor does not extract text from the footer of the master slide

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1763927 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Andreas Beeker 2016-10-08 18:08:25 +00:00
parent b2b0e88cd1
commit 76c87681a6
3 changed files with 74 additions and 25 deletions

View File

@ -17,21 +17,43 @@
package org.apache.poi.hslf.extractor; package org.apache.poi.hslf.extractor;
import java.io.*; import java.io.File;
import java.util.*; import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hslf.model.*; import org.apache.poi.hslf.model.Comment;
import org.apache.poi.hslf.usermodel.*; import org.apache.poi.hslf.model.HSLFMetroShape;
import org.apache.poi.poifs.filesystem.*; import org.apache.poi.hslf.model.HeadersFooters;
import org.apache.poi.hslf.model.OLEShape;
import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
import org.apache.poi.hslf.usermodel.HSLFNotes;
import org.apache.poi.hslf.usermodel.HSLFShape;
import org.apache.poi.hslf.usermodel.HSLFSlide;
import org.apache.poi.hslf.usermodel.HSLFSlideMaster;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
import org.apache.poi.hslf.usermodel.HSLFTable;
import org.apache.poi.hslf.usermodel.HSLFTableCell;
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/** /**
* This class can be used to extract text from a PowerPoint file. Can optionally * This class can be used to extract text from a PowerPoint file. Can optionally
* also get the notes from one. * also get the notes from one.
*
* @author Nick Burch
*/ */
public final class PowerPointExtractor extends POIOLE2TextExtractor { public final class PowerPointExtractor extends POIOLE2TextExtractor {
private static final POILogger LOG = POILogFactory.getLogger(PowerPointExtractor.class);
private final HSLFSlideShowImpl _hslfshow; private final HSLFSlideShowImpl _hslfshow;
private final HSLFSlideShow _show; private final HSLFSlideShow _show;
private final List<HSLFSlide> _slides; private final List<HSLFSlide> _slides;
@ -207,20 +229,27 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
for (HSLFSlideMaster master : _show.getSlideMasters()) { for (HSLFSlideMaster master : _show.getSlideMasters()) {
for(HSLFShape sh : master.getShapes()){ for(HSLFShape sh : master.getShapes()){
if(sh instanceof HSLFTextShape){ if(sh instanceof HSLFTextShape){
if(HSLFMasterSheet.isPlaceholder(sh)) { HSLFTextShape hsh = (HSLFTextShape)sh;
// don't bother about boiler final String text = hsh.getText();
// plate text on master if (text == null || "".equals(text) || "*".equals(text)) {
// sheets
continue; continue;
} }
HSLFTextShape tsh = (HSLFTextShape)sh;
String text = tsh.getText(); if (HSLFMasterSheet.isPlaceholder(sh)) {
if (text != null){ // check for metro shape of complex placeholder
ret.append(text); boolean isMetro = new HSLFMetroShape<HSLFShape>(sh).hasMetroBlob();
if (!text.endsWith("\n")) {
ret.append("\n"); if (!isMetro) {
// don't bother about boiler plate text on master sheets
LOG.log(POILogger.INFO, "Ignoring boiler plate (placeholder) text on slide master:", text);
continue;
} }
} }
ret.append(text);
if (!text.endsWith("\n")) {
ret.append("\n");
}
} }
} }
} }

View File

@ -47,14 +47,20 @@ public class HSLFMetroShape<T extends Shape<?,?>> {
* @return the bytes of the metro blob, which are bytes of an OPCPackage, i.e. a zip stream * @return the bytes of the metro blob, which are bytes of an OPCPackage, i.e. a zip stream
*/ */
public byte[] getMetroBytes() { public byte[] getMetroBytes() {
EscherComplexProperty ep = getMetroProp();
return (ep == null) ? null : ep.getComplexData();
}
/**
 * Tells whether this shape carries a metro blob.
 *
 * @return {@code true} if the metro blob property is present on the shape,
 *         {@code false} otherwise
 */
public boolean hasMetroBlob() {
    final EscherComplexProperty metroProp = getMetroProp();
    return metroProp != null;
}
private EscherComplexProperty getMetroProp() {
AbstractEscherOptRecord opt = shape.getEscherChild(EscherTertiaryOptRecord.RECORD_ID); AbstractEscherOptRecord opt = shape.getEscherChild(EscherTertiaryOptRecord.RECORD_ID);
if (opt != null) { return (opt == null) ? null : (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
EscherComplexProperty ep = (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
if (ep != null) {
return ep.getComplexData();
}
}
return null;
} }
/** /**

View File

@ -432,4 +432,18 @@ public final class TestExtractor {
"table\t1\t2\t3\t4"; "table\t1\t2\t3\t4";
assertTrue(text.contains(target)); assertTrue(text.contains(target));
} }
// bug 60003: text from the master slide footer must show up in the extracted
// text when master sheet extraction is switched on
@Test
public void testExtractMasterSlideFooterText() throws Exception {
    HSLFSlideShowImpl hslf = new HSLFSlideShowImpl(slTests.openResourceAsStream("60003.ppt"));
    try {
        // release the extractor currently held in the shared field before
        // replacing it (presumably the one created by the fixture setup)
        ppe.close();
        ppe = new PowerPointExtractor(hslf);
        ppe.setMasterByDefault(true);

        String text = ppe.getText();
        assertContains(text, "Prague");
    } finally {
        // close the slideshow even when extraction or the assertion fails,
        // so a failing run does not leak the opened document
        hslf.close();
    }
}
} }