mirror of https://github.com/apache/poi.git
Bug 60003 - Regression: HSLF Powerpoint text extractor from footer of master slide
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1763927 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b2b0e88cd1
commit
76c87681a6
|
@ -17,21 +17,43 @@
|
|||
|
||||
package org.apache.poi.hslf.extractor;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hslf.model.*;
|
||||
import org.apache.poi.hslf.usermodel.*;
|
||||
import org.apache.poi.poifs.filesystem.*;
|
||||
import org.apache.poi.hslf.model.Comment;
|
||||
import org.apache.poi.hslf.model.HSLFMetroShape;
|
||||
import org.apache.poi.hslf.model.HeadersFooters;
|
||||
import org.apache.poi.hslf.model.OLEShape;
|
||||
import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
|
||||
import org.apache.poi.hslf.usermodel.HSLFNotes;
|
||||
import org.apache.poi.hslf.usermodel.HSLFShape;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlide;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideMaster;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
|
||||
import org.apache.poi.hslf.usermodel.HSLFSlideShowImpl;
|
||||
import org.apache.poi.hslf.usermodel.HSLFTable;
|
||||
import org.apache.poi.hslf.usermodel.HSLFTableCell;
|
||||
import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
|
||||
import org.apache.poi.hslf.usermodel.HSLFTextShape;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* This class can be used to extract text from a PowerPoint file. Can optionally
|
||||
* also get the notes from one.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
||||
private static final POILogger LOG = POILogFactory.getLogger(PowerPointExtractor.class);
|
||||
|
||||
private final HSLFSlideShowImpl _hslfshow;
|
||||
private final HSLFSlideShow _show;
|
||||
private final List<HSLFSlide> _slides;
|
||||
|
@ -207,20 +229,27 @@ public final class PowerPointExtractor extends POIOLE2TextExtractor {
|
|||
for (HSLFSlideMaster master : _show.getSlideMasters()) {
|
||||
for(HSLFShape sh : master.getShapes()){
|
||||
if(sh instanceof HSLFTextShape){
|
||||
if(HSLFMasterSheet.isPlaceholder(sh)) {
|
||||
// don't bother about boiler
|
||||
// plate text on master
|
||||
// sheets
|
||||
HSLFTextShape hsh = (HSLFTextShape)sh;
|
||||
final String text = hsh.getText();
|
||||
if (text == null || "".equals(text) || "*".equals(text)) {
|
||||
continue;
|
||||
}
|
||||
HSLFTextShape tsh = (HSLFTextShape)sh;
|
||||
String text = tsh.getText();
|
||||
if (text != null){
|
||||
ret.append(text);
|
||||
if (!text.endsWith("\n")) {
|
||||
ret.append("\n");
|
||||
|
||||
if (HSLFMasterSheet.isPlaceholder(sh)) {
|
||||
// check for metro shape of complex placeholder
|
||||
boolean isMetro = new HSLFMetroShape<HSLFShape>(sh).hasMetroBlob();
|
||||
|
||||
if (!isMetro) {
|
||||
// don't bother about boiler plate text on master sheets
|
||||
LOG.log(POILogger.INFO, "Ignoring boiler plate (placeholder) text on slide master:", text);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
ret.append(text);
|
||||
if (!text.endsWith("\n")) {
|
||||
ret.append("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -47,14 +47,20 @@ public class HSLFMetroShape<T extends Shape<?,?>> {
|
|||
* @return the bytes of the metro blob, which are bytes of an OPCPackage, i.e. a zip stream, or {@code null} if the shape has no metro blob
|
||||
*/
|
||||
public byte[] getMetroBytes() {
|
||||
EscherComplexProperty ep = getMetroProp();
|
||||
return (ep == null) ? null : ep.getComplexData();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return {@code true} if the shape has a metro blob to extract, {@code false} otherwise
|
||||
*/
|
||||
public boolean hasMetroBlob() {
|
||||
return getMetroProp() != null;
|
||||
}
|
||||
|
||||
private EscherComplexProperty getMetroProp() {
|
||||
AbstractEscherOptRecord opt = shape.getEscherChild(EscherTertiaryOptRecord.RECORD_ID);
|
||||
if (opt != null) {
|
||||
EscherComplexProperty ep = (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
|
||||
if (ep != null) {
|
||||
return ep.getComplexData();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
return (opt == null) ? null : (EscherComplexProperty)opt.lookup(EscherProperties.GROUPSHAPE__METROBLOB);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -431,5 +431,19 @@ public final class TestExtractor {
|
|||
String target = "this\tText\tis\twithin\ta\n"+
|
||||
"table\t1\t2\t3\t4";
|
||||
assertTrue(text.contains(target));
|
||||
}
|
||||
}
|
||||
|
||||
// bug 60003
|
||||
@Test
|
||||
public void testExtractMasterSlideFooterText() throws Exception {
|
||||
HSLFSlideShowImpl hslf = new HSLFSlideShowImpl(slTests.openResourceAsStream("60003.ppt"));
|
||||
ppe.close();
|
||||
|
||||
ppe = new PowerPointExtractor(hslf);
|
||||
ppe.setMasterByDefault(true);
|
||||
|
||||
String text = ppe.getText();
|
||||
assertContains(text, "Prague");
|
||||
hslf.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue