mirror of https://github.com/apache/poi.git
QuickButCruddyTextExtractor - gets all the text (including stuff you might not want), but does it fast
git-svn-id: https://svn.apache.org/repos/asf/jakarta/poi/trunk@353710 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
396e06e685
commit
0c65c84e10
|
@ -126,7 +126,9 @@ public class PowerPointExtractor
|
|||
}
|
||||
|
||||
/**
|
||||
* Fetches text from the slideshow, be it slide text or note text
|
||||
* Fetches text from the slideshow, be it slide text or note text.
|
||||
* Because the final block of text in a TextRun normally has its
|
||||
* last \n stripped, we add it back
|
||||
* @param getSlideText fetch slide text
|
||||
* @param getNoteText fetch note text
|
||||
*/
|
||||
|
@ -139,10 +141,12 @@ public class PowerPointExtractor
|
|||
TextRun[] runs = slide.getTextRuns();
|
||||
for(int j=0; j<runs.length; j++) {
|
||||
TextRun run = runs[j];
|
||||
String text = run.getText();
|
||||
ret.append(text);
|
||||
if(! text.endsWith("\n")) {
|
||||
ret.append("\n");
|
||||
if(run != null) {
|
||||
String text = run.getText();
|
||||
ret.append(text);
|
||||
if(! text.endsWith("\n")) {
|
||||
ret.append("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,205 @@
|
|||
|
||||
/* ====================================================================
|
||||
Copyright 2002-2004 Apache Software Foundation
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
|
||||
|
||||
package org.apache.poi.hslf.extractor;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Vector;
|
||||
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.POIFSDocument;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.filesystem.DocumentInputStream;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
import org.apache.poi.hslf.record.Record;
|
||||
import org.apache.poi.hslf.record.TextHeaderAtom;
|
||||
import org.apache.poi.hslf.record.TextBytesAtom;
|
||||
import org.apache.poi.hslf.record.TextCharsAtom;
|
||||
import org.apache.poi.hslf.model.TextRun;
|
||||
|
||||
/**
|
||||
* This class will get all the text from a Powerpoint Document, including
|
||||
* all the bits you didn't want, and in a somewhat random order, but will
|
||||
* do it very fast.
|
||||
* The class ignores most of the hslf classes, and doesn't use
|
||||
* HSLFSlideShow. Instead, it just does a very basic scan through the
|
||||
* file, grabbing all the text records as it goes. It then returns the
|
||||
* text, either as a single string, or as a vector of all the individual
|
||||
* strings.
|
||||
* Because of how it works, it will return a lot of "crud" text that you
|
||||
* probably didn't want! It will return text from master slides. It will
|
||||
* return duplicate text, and some mangled text (powerpoint files often
|
||||
* have duplicate copies of slide text in them). You don't get any idea
|
||||
* what the text was associated with.
|
||||
* Almost everyone will want to use {@link PowerPointExtractor} instead. There
|
||||
* are only a very small number of cases (eg some performance sensitive
|
||||
* lucene indexers) that would ever want to use this!
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
|
||||
public class QuickButCruddyTextExtractor
|
||||
{
|
||||
private POIFSFileSystem fs;
|
||||
private InputStream is;
|
||||
private byte[] pptContents;
|
||||
|
||||
/**
|
||||
* Really basic text extractor, that will also return lots of crud text.
|
||||
* Takes a single argument, the file to extract from
|
||||
*/
|
||||
public static void main(String args[]) throws IOException
|
||||
{
|
||||
if(args.length < 1) {
|
||||
System.err.println("Useage:");
|
||||
System.err.println("\tQuickButCruddyTextExtractor <file>");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
String file = args[0];
|
||||
|
||||
QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(file);
|
||||
System.out.println(ppe.getTextAsString());
|
||||
ppe.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an extractor from a given file name
|
||||
* @param fileName
|
||||
*/
|
||||
public QuickButCruddyTextExtractor(String fileName) throws IOException {
|
||||
this(new FileInputStream(fileName));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an extractor from a given input stream
|
||||
* @param iStream
|
||||
*/
|
||||
public QuickButCruddyTextExtractor(InputStream iStream) throws IOException {
|
||||
this(new POIFSFileSystem(iStream));
|
||||
is = iStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an extractor from a POIFS Filesystem
|
||||
* @param poifs
|
||||
*/
|
||||
public QuickButCruddyTextExtractor(POIFSFileSystem poifs) throws IOException {
|
||||
fs = poifs;
|
||||
|
||||
// Find the PowerPoint bit, and get out the bytes
|
||||
DocumentEntry docProps =
|
||||
(DocumentEntry)fs.getRoot().getEntry("PowerPoint Document");
|
||||
pptContents = new byte[docProps.getSize()];
|
||||
fs.createDocumentInputStream("PowerPoint Document").read(pptContents);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Shuts down the underlying streams
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
if(is != null) { is.close(); }
|
||||
fs = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches the ALL the text of the powerpoint file, as a single string
|
||||
*/
|
||||
public String getTextAsString() {
|
||||
StringBuffer ret = new StringBuffer();
|
||||
Vector textV = getTextAsVector();
|
||||
for(int i=0; i<textV.size(); i++) {
|
||||
String text = (String)textV.get(i);
|
||||
ret.append(text);
|
||||
if(! text.endsWith("\n")) {
|
||||
ret.append('\n');
|
||||
}
|
||||
}
|
||||
return ret.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches the ALL the text of the powerpoint file, in a vector of
|
||||
* strings, one per text record
|
||||
*/
|
||||
public Vector getTextAsVector() {
|
||||
Vector textV = new Vector();
|
||||
|
||||
// Set to the start of the file
|
||||
int walkPos = 0;
|
||||
|
||||
// Start walking the file, looking for the records
|
||||
while(walkPos != -1) {
|
||||
int newPos = findTextRecords(walkPos,textV);
|
||||
walkPos = newPos;
|
||||
}
|
||||
|
||||
// Return what we find
|
||||
return textV;
|
||||
}
|
||||
|
||||
/**
|
||||
* For the given position, look if the record is a text record, and wind
|
||||
* on after.
|
||||
* If it is a text record, grabs out the text. Whatever happens, returns
|
||||
* the position of the next record, or -1 if no more.
|
||||
*/
|
||||
public int findTextRecords(int startPos, Vector textV) {
|
||||
// Grab the length, and the first option byte
|
||||
// Note that the length doesn't include the 8 byte atom header
|
||||
int len = (int)LittleEndian.getUInt(pptContents,startPos+4);
|
||||
byte opt = pptContents[startPos];
|
||||
|
||||
// If it's a container, step into it and return
|
||||
// (If it's a container, option byte 1 BINARY_AND 0x0f will be 0x0f)
|
||||
int container = (int)opt & 0x0f;
|
||||
if(container == 0x0f) {
|
||||
return (startPos+8);
|
||||
}
|
||||
|
||||
// Otherwise, check the type to see if it's text
|
||||
long type = LittleEndian.getUShort(pptContents,startPos+2);
|
||||
TextRun trun = null;
|
||||
|
||||
// TextBytesAtom
|
||||
if(type == 4008l) {
|
||||
TextBytesAtom tba = (TextBytesAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
|
||||
trun = new TextRun((TextHeaderAtom)null,tba);
|
||||
}
|
||||
// TextCharsAtom
|
||||
if(type == 4000l) {
|
||||
TextCharsAtom tca = (TextCharsAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
|
||||
trun = new TextRun((TextHeaderAtom)null,tca);
|
||||
}
|
||||
|
||||
// If we found text, save it in the vector
|
||||
if(trun != null) {
|
||||
textV.add(trun.getText());
|
||||
}
|
||||
|
||||
// Wind on by the atom length, and check we're not at the end
|
||||
int newPos = (startPos + 8 + len);
|
||||
if(newPos > (pptContents.length - 8)) {
|
||||
newPos = -1;
|
||||
}
|
||||
return newPos;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue