convert some tabs to spaces

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1871921 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
PJ Fanning 2019-12-23 09:18:38 +00:00
parent 66471836f5
commit 93a7b81ed9
5 changed files with 530 additions and 530 deletions

View File

@ -34,49 +34,49 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
* file format.
*/
public final class HPBFDocument extends POIReadOnlyDocument {
private MainContents mainContents;
private QuillContents quillContents;
private EscherStm escherStm;
private EscherDelayStm escherDelayStm;
private MainContents mainContents;
private QuillContents quillContents;
private EscherStm escherStm;
private EscherDelayStm escherDelayStm;
/**
* Opens a new publisher document
*/
public HPBFDocument(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
}
/**
* Opens a new publisher document
*/
public HPBFDocument(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
}
public HPBFDocument(InputStream inp) throws IOException {
this(new POIFSFileSystem(inp));
}
public HPBFDocument(InputStream inp) throws IOException {
this(new POIFSFileSystem(inp));
}
/**
* Opens an embedded publisher document,
* at the given directory.
*/
public HPBFDocument(DirectoryNode dir) throws IOException {
super(dir);
/**
* Opens an embedded publisher document,
* at the given directory.
*/
public HPBFDocument(DirectoryNode dir) throws IOException {
super(dir);
// Go looking for our interesting child
// streams
mainContents = new MainContents(dir);
quillContents = new QuillContents(dir);
// Go looking for our interesting child
// streams
mainContents = new MainContents(dir);
quillContents = new QuillContents(dir);
// Now the Escher bits
escherStm = new EscherStm(dir);
escherDelayStm = new EscherDelayStm(dir);
}
// Now the Escher bits
escherStm = new EscherStm(dir);
escherDelayStm = new EscherDelayStm(dir);
}
public MainContents getMainContents() {
return mainContents;
}
public QuillContents getQuillContents() {
return quillContents;
}
public EscherStm getEscherStm() {
return escherStm;
}
public EscherDelayStm getEscherDelayStm() {
return escherDelayStm;
}
public MainContents getMainContents() {
return mainContents;
}
public QuillContents getQuillContents() {
return quillContents;
}
public EscherStm getEscherStm() {
return escherStm;
}
public EscherDelayStm getEscherDelayStm() {
return escherDelayStm;
}
}

View File

@ -36,319 +36,319 @@ import org.apache.poi.util.StringUtil;
* constructed.
*/
public final class HPBFDumper {
private POIFSFileSystem fs;
public HPBFDumper(POIFSFileSystem fs) {
this.fs = fs;
}
@SuppressWarnings("resource")
private POIFSFileSystem fs;
public HPBFDumper(POIFSFileSystem fs) {
this.fs = fs;
}
@SuppressWarnings("resource")
public HPBFDumper(InputStream inp) throws IOException {
this(new POIFSFileSystem(inp));
}
this(new POIFSFileSystem(inp));
}
private static byte[] getData(DirectoryNode dir, String name) throws IOException {
// Grab the document stream
InputStream is = dir.createDocumentInputStream(name);
byte[] d = IOUtils.toByteArray(is);
is.close();
private static byte[] getData(DirectoryNode dir, String name) throws IOException {
// Grab the document stream
InputStream is = dir.createDocumentInputStream(name);
byte[] d = IOUtils.toByteArray(is);
is.close();
// All done
return d;
}
// All done
return d;
}
/**
* Dumps out the given number of bytes as hex,
* two chars
*/
private String dumpBytes(byte[] data, int offset, int len) {
StringBuilder ret = new StringBuilder();
for(int i=0; i<len; i++) {
int j = i + offset;
int b = data[j];
if(b < 0) { b += 256; }
/**
* Dumps out the given number of bytes as hex,
* two chars
*/
private String dumpBytes(byte[] data, int offset, int len) {
StringBuilder ret = new StringBuilder();
for(int i=0; i<len; i++) {
int j = i + offset;
int b = data[j];
if(b < 0) { b += 256; }
String bs = Integer.toHexString(b);
if(bs.length() == 1)
ret.append('0');
ret.append(bs);
ret.append(' ');
}
return ret.toString();
}
String bs = Integer.toHexString(b);
if(bs.length() == 1)
ret.append('0');
ret.append(bs);
ret.append(' ');
}
return ret.toString();
}
@SuppressWarnings("resource")
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
if(args.length < 1) {
System.err.println("Use:");
System.err.println(" HPBFDumper <filename>");
System.exit(1);
}
HPBFDumper dump = new HPBFDumper(new POIFSFileSystem(new File(args[0])));
if(args.length < 1) {
System.err.println("Use:");
System.err.println(" HPBFDumper <filename>");
System.exit(1);
}
HPBFDumper dump = new HPBFDumper(new POIFSFileSystem(new File(args[0])));
System.out.println("Dumping " + args[0]);
dump.dumpContents();
dump.dumpEnvelope();
dump.dumpEscher();
dump.dump001CompObj(dump.fs.getRoot());
dump.dumpQuill();
System.out.println("Dumping " + args[0]);
dump.dumpContents();
dump.dumpEnvelope();
dump.dumpEscher();
dump.dump001CompObj(dump.fs.getRoot());
dump.dumpQuill();
// Still to go:
// (0x03)Internal
// Objects
}
// Still to go:
// (0x03)Internal
// Objects
}
/**
* Dump out the escher parts of the file.
* Escher -> EscherStm and EscherDelayStm
*/
public void dumpEscher() throws IOException {
DirectoryNode escherDir = (DirectoryNode)
fs.getRoot().getEntry("Escher");
/**
* Dump out the escher parts of the file.
* Escher -> EscherStm and EscherDelayStm
*/
public void dumpEscher() throws IOException {
DirectoryNode escherDir = (DirectoryNode)
fs.getRoot().getEntry("Escher");
dumpEscherStm(escherDir);
dumpEscherDelayStm(escherDir);
}
private void dumpEscherStream(byte[] data) {
DefaultEscherRecordFactory erf =
new DefaultEscherRecordFactory();
dumpEscherStm(escherDir);
dumpEscherDelayStm(escherDir);
}
private void dumpEscherStream(byte[] data) {
DefaultEscherRecordFactory erf =
new DefaultEscherRecordFactory();
// Dump
int left = data.length;
while(left > 0) {
EscherRecord er = erf.createRecord(data, 0);
er.fillFields(data, 0, erf);
left -= er.getRecordSize();
// Dump
int left = data.length;
while(left > 0) {
EscherRecord er = erf.createRecord(data, 0);
er.fillFields(data, 0, erf);
left -= er.getRecordSize();
System.out.println(er);
}
}
protected void dumpEscherStm(DirectoryNode escherDir) throws IOException {
byte[] data = getData(escherDir, "EscherStm");
System.out.println();
System.out.println("EscherStm - " + data.length + " bytes long:");
if(data.length > 0)
dumpEscherStream(data);
}
protected void dumpEscherDelayStm(DirectoryNode escherDir) throws IOException {
byte[] data = getData(escherDir, "EscherDelayStm");
System.out.println();
System.out.println("EscherDelayStm - " + data.length + " bytes long:");
if(data.length > 0)
dumpEscherStream(data);
}
System.out.println(er);
}
}
protected void dumpEscherStm(DirectoryNode escherDir) throws IOException {
byte[] data = getData(escherDir, "EscherStm");
System.out.println();
System.out.println("EscherStm - " + data.length + " bytes long:");
if(data.length > 0)
dumpEscherStream(data);
}
protected void dumpEscherDelayStm(DirectoryNode escherDir) throws IOException {
byte[] data = getData(escherDir, "EscherDelayStm");
System.out.println();
System.out.println("EscherDelayStm - " + data.length + " bytes long:");
if(data.length > 0)
dumpEscherStream(data);
}
public void dumpEnvelope() throws IOException {
byte[] data = getData(fs.getRoot(), "Envelope");
public void dumpEnvelope() throws IOException {
byte[] data = getData(fs.getRoot(), "Envelope");
System.out.println();
System.out.println("Envelope - " + data.length + " bytes long:");
}
System.out.println();
System.out.println("Envelope - " + data.length + " bytes long:");
}
public void dumpContents() throws IOException {
byte[] data = getData(fs.getRoot(), "Contents");
public void dumpContents() throws IOException {
byte[] data = getData(fs.getRoot(), "Contents");
System.out.println();
System.out.println("Contents - " + data.length + " bytes long:");
System.out.println();
System.out.println("Contents - " + data.length + " bytes long:");
// 8 bytes, always seems to be
// E8 AC 2C 00 E8 03 05 01
// E8 AC 2C 00 E8 03 05 01
// 8 bytes, always seems to be
// E8 AC 2C 00 E8 03 05 01
// E8 AC 2C 00 E8 03 05 01
// 4 bytes - size of contents
// 13/15 00 00 01
// 4 bytes - size of contents
// 13/15 00 00 01
// ....
// ....
// E8 03 08 08 0C 20 03 00 00 00 00 88 16 00 00 00 ..... ..........
// E8 03 08 08 0C 20 03 00 00 00 00 88 16 00 00 00 ..... ..........
// 01 18 27 00 03 20 00 00 E8 03 08 08 0C 20 03 00 ..'.. ....... ..
// 01 18 27 00 03 20 00 00 E8 03 08 08 0C 20 03 00 ..'.. ....... ..
// 01 18 30 00 03 20 00 00
// E8 03 06 08 07 08 08 08 09 10 01 00 0C 20 04 00
// 00 00 00 88 1E 00 00 00
// 01 18 30 00 03 20 00 00
// E8 03 06 08 07 08 08 08 09 10 01 00 0C 20 04 00
// 00 00 00 88 1E 00 00 00
// 01 18 31 00 03 20 00 00
// E8 03 06 08 07 08 08 08 09 10 01 00 0C 20 04 00
// 00 00 00 88 1E 00 00 00
// 01 18 31 00 03 20 00 00
// E8 03 06 08 07 08 08 08 09 10 01 00 0C 20 04 00
// 00 00 00 88 1E 00 00 00
// 01 18 32 00 03 20 00 00
// E8 03 06 08 07 08 08 08 09 10 01 00 0C 20 04 00
// 00 00 00 88 1E 00 00 00
}
// 01 18 32 00 03 20 00 00
// E8 03 06 08 07 08 08 08 09 10 01 00 0C 20 04 00
// 00 00 00 88 1E 00 00 00
}
public void dumpCONTENTSraw(DirectoryNode dir) throws IOException {
byte[] data = getData(dir, "CONTENTS");
public void dumpCONTENTSraw(DirectoryNode dir) throws IOException {
byte[] data = getData(dir, "CONTENTS");
System.out.println();
System.out.println("CONTENTS - " + data.length + " bytes long:");
System.out.println();
System.out.println("CONTENTS - " + data.length + " bytes long:");
// Between the start and 0x200 we have
// CHNKINK(space) + 24 bytes
// 0x1800
// TEXT + 6 bytes
// TEXT + 8 bytes
// 0x1800
// STSH + 6 bytes
// STSH + 8 bytes
// 0x1800
// STSH + 6 bytes
// STSH + 8 bytes
// but towards 0x200 the pattern may
// break down a little bit
// Between the start and 0x200 we have
// CHNKINK(space) + 24 bytes
// 0x1800
// TEXT + 6 bytes
// TEXT + 8 bytes
// 0x1800
// STSH + 6 bytes
// STSH + 8 bytes
// 0x1800
// STSH + 6 bytes
// STSH + 8 bytes
// but towards 0x200 the pattern may
// break down a little bit
// After the second of a given type,
// it seems to be 4 bytes giving the start,
// then 4 bytes giving the length, then
// 18 00
System.out.println(
new String(data, 0, 8, LocaleUtil.CHARSET_1252) +
dumpBytes(data, 8, 0x20-8)
);
// After the second of a given type,
// it seems to be 4 bytes giving the start,
// then 4 bytes giving the length, then
// 18 00
System.out.println(
new String(data, 0, 8, LocaleUtil.CHARSET_1252) +
dumpBytes(data, 8, 0x20-8)
);
int pos = 0x20;
boolean sixNotEight = true;
while(pos < 0x200) {
if(sixNotEight) {
System.out.println(
dumpBytes(data, pos, 2)
);
pos += 2;
}
String text = new String(data, pos, 4, LocaleUtil.CHARSET_1252);
int blen = 8;
if(sixNotEight)
blen = 6;
System.out.println(
text + " " + dumpBytes(data, pos+4, blen)
);
int pos = 0x20;
boolean sixNotEight = true;
while(pos < 0x200) {
if(sixNotEight) {
System.out.println(
dumpBytes(data, pos, 2)
);
pos += 2;
}
String text = new String(data, pos, 4, LocaleUtil.CHARSET_1252);
int blen = 8;
if(sixNotEight)
blen = 6;
System.out.println(
text + " " + dumpBytes(data, pos+4, blen)
);
pos += 4 + blen;
sixNotEight = ! sixNotEight;
}
pos += 4 + blen;
sixNotEight = ! sixNotEight;
}
// Text from 0x200 onwards until we get
// to \r(00)\n(00)(00)(00)
int textStop = -1;
for(int i=0x200; i<data.length-2 && textStop == -1; i++) {
if(data[i] == 0 && data[i+1] == 0 && data[i+2] == 0) {
textStop = i;
}
}
if(textStop > 0) {
int len = (textStop - 0x200) / 2;
System.out.println();
System.out.println(
StringUtil.getFromUnicodeLE(data, 0x200, len)
);
}
// Text from 0x200 onwards until we get
// to \r(00)\n(00)(00)(00)
int textStop = -1;
for(int i=0x200; i<data.length-2 && textStop == -1; i++) {
if(data[i] == 0 && data[i+1] == 0 && data[i+2] == 0) {
textStop = i;
}
}
if(textStop > 0) {
int len = (textStop - 0x200) / 2;
System.out.println();
System.out.println(
StringUtil.getFromUnicodeLE(data, 0x200, len)
);
}
// The font list comes slightly later
// The font list comes slightly later
// The hyperlinks may come before the fonts,
// or slightly in front
}
public void dumpCONTENTSguessed(DirectoryNode dir) throws IOException {
byte[] data = getData(dir, "CONTENTS");
// The hyperlinks may come before the fonts,
// or slightly in front
}
public void dumpCONTENTSguessed(DirectoryNode dir) throws IOException {
byte[] data = getData(dir, "CONTENTS");
System.out.println();
System.out.println("CONTENTS - " + data.length + " bytes long:");
System.out.println();
System.out.println("CONTENTS - " + data.length + " bytes long:");
String[] startType = new String[20];
String[] endType = new String[20];
int[] optA = new int[20];
int[] optB = new int[20];
int[] optC = new int[20];
int[] from = new int[20];
int[] len = new int[20];
String[] startType = new String[20];
String[] endType = new String[20];
int[] optA = new int[20];
int[] optB = new int[20];
int[] optC = new int[20];
int[] from = new int[20];
int[] len = new int[20];
for(int i=0; i<20; i++) {
int offset = 0x20 + i*24;
if(data[offset] == 0x18 && data[offset+1] == 0x00) {
// Has data
startType[i] = new String(data, offset+2, 4, LocaleUtil.CHARSET_1252);
optA[i] = LittleEndian.getUShort(data, offset+6);
optB[i] = LittleEndian.getUShort(data, offset+8);
optC[i] = LittleEndian.getUShort(data, offset+10);
endType[i] = new String(data, offset+12, 4, LocaleUtil.CHARSET_1252);
from[i] = (int)LittleEndian.getUInt(data, offset+16);
len[i] = (int)LittleEndian.getUInt(data, offset+20);
} else {
// Doesn't have data
}
}
for(int i=0; i<20; i++) {
int offset = 0x20 + i*24;
if(data[offset] == 0x18 && data[offset+1] == 0x00) {
// Has data
startType[i] = new String(data, offset+2, 4, LocaleUtil.CHARSET_1252);
optA[i] = LittleEndian.getUShort(data, offset+6);
optB[i] = LittleEndian.getUShort(data, offset+8);
optC[i] = LittleEndian.getUShort(data, offset+10);
endType[i] = new String(data, offset+12, 4, LocaleUtil.CHARSET_1252);
from[i] = (int)LittleEndian.getUInt(data, offset+16);
len[i] = (int)LittleEndian.getUInt(data, offset+20);
} else {
// Doesn't have data
}
}
String text = StringUtil.getFromUnicodeLE(
data, from[0], len[0]/2
);
String text = StringUtil.getFromUnicodeLE(
data, from[0], len[0]/2
);
// Dump
for(int i=0; i<20; i++) {
String num = Integer.toString(i);
if(i < 10) {
num = "0" + i;
}
System.out.print(num + " ");
// Dump
for(int i=0; i<20; i++) {
String num = Integer.toString(i);
if(i < 10) {
num = "0" + i;
}
System.out.print(num + " ");
if(startType[i] == null) {
System.out.println("(not present)");
} else {
System.out.println(
"\t" +
startType[i] + " " +
optA[i] + " " +
optB[i] + " " +
optC[i]
);
System.out.println(
"\t" +
endType[i] + " " +
"from: " +
Integer.toHexString(from[i]) +
" (" + from[i] + ")" +
", len: " +
Integer.toHexString(len[i]) +
" (" + len[i] + ")"
);
}
}
if(startType[i] == null) {
System.out.println("(not present)");
} else {
System.out.println(
"\t" +
startType[i] + " " +
optA[i] + " " +
optB[i] + " " +
optC[i]
);
System.out.println(
"\t" +
endType[i] + " " +
"from: " +
Integer.toHexString(from[i]) +
" (" + from[i] + ")" +
", len: " +
Integer.toHexString(len[i]) +
" (" + len[i] + ")"
);
}
}
// Text
System.out.println();
System.out.println("TEXT:");
System.out.println(text);
System.out.println();
// Text
System.out.println();
System.out.println("TEXT:");
System.out.println(text);
System.out.println();
// All the others
for(int i=0; i<20; i++) {
if(startType[i] == null) {
continue;
}
int start = from[i];
// All the others
for(int i=0; i<20; i++) {
if(startType[i] == null) {
continue;
}
int start = from[i];
System.out.println(
startType[i] + " -> " + endType[i] +
" @ " + Integer.toHexString(start) +
" (" + start + ")"
);
System.out.println("\t" + dumpBytes(data, start, 4));
System.out.println("\t" + dumpBytes(data, start+4, 4));
System.out.println("\t" + dumpBytes(data, start+8, 4));
System.out.println("\t(etc)");
}
}
System.out.println(
startType[i] + " -> " + endType[i] +
" @ " + Integer.toHexString(start) +
" (" + start + ")"
);
System.out.println("\t" + dumpBytes(data, start, 4));
System.out.println("\t" + dumpBytes(data, start+4, 4));
System.out.println("\t" + dumpBytes(data, start+8, 4));
System.out.println("\t(etc)");
}
}
protected void dump001CompObj(DirectoryNode dir) {
// TODO
}
protected void dump001CompObj(DirectoryNode dir) {
// TODO
}
public void dumpQuill() throws IOException {
DirectoryNode quillDir = (DirectoryNode)
fs.getRoot().getEntry("Quill");
DirectoryNode quillSubDir = (DirectoryNode)
quillDir.getEntry("QuillSub");
public void dumpQuill() throws IOException {
DirectoryNode quillDir = (DirectoryNode)
fs.getRoot().getEntry("Quill");
DirectoryNode quillSubDir = (DirectoryNode)
quillDir.getEntry("QuillSub");
dump001CompObj(quillSubDir);
dumpCONTENTSraw(quillSubDir);
dumpCONTENTSguessed(quillSubDir);
}
dump001CompObj(quillSubDir);
dumpCONTENTSraw(quillSubDir);
dumpCONTENTSguessed(quillSubDir);
}
}

View File

@ -33,53 +33,53 @@ import org.apache.poi.util.HexDump;
* what the format of them is.
*/
public final class PLCDumper {
private HPBFDocument doc;
private QuillContents qc;
private HPBFDocument doc;
private QuillContents qc;
public PLCDumper(HPBFDocument hpbfDoc) {
doc = hpbfDoc;
qc = doc.getQuillContents();
}
public PLCDumper(POIFSFileSystem fs) throws IOException {
this(new HPBFDocument(fs));
}
public PLCDumper(InputStream inp) throws IOException {
this(new POIFSFileSystem(inp));
}
public PLCDumper(HPBFDocument hpbfDoc) {
doc = hpbfDoc;
qc = doc.getQuillContents();
}
public PLCDumper(POIFSFileSystem fs) throws IOException {
this(new HPBFDocument(fs));
}
public PLCDumper(InputStream inp) throws IOException {
this(new POIFSFileSystem(inp));
}
public static void main(String[] args) throws Exception {
if(args.length < 1) {
System.err.println("Use:");
System.err.println(" PLCDumper <filename>");
System.exit(1);
}
public static void main(String[] args) throws Exception {
if(args.length < 1) {
System.err.println("Use:");
System.err.println(" PLCDumper <filename>");
System.exit(1);
}
try (FileInputStream fis = new FileInputStream(args[0])) {
PLCDumper dump = new PLCDumper(fis);
try (FileInputStream fis = new FileInputStream(args[0])) {
PLCDumper dump = new PLCDumper(fis);
System.out.println("Dumping " + args[0]);
dump.dumpPLC();
}
}
System.out.println("Dumping " + args[0]);
dump.dumpPLC();
}
}
private void dumpPLC() {
QCBit[] bits = qc.getBits();
private void dumpPLC() {
QCBit[] bits = qc.getBits();
for(int i=0; i<bits.length; i++) {
if(bits[i] == null) continue;
if(bits[i].getBitType().equals("PLC ")) {
dumpBit(bits[i], i);
}
}
}
for(int i=0; i<bits.length; i++) {
if(bits[i] == null) continue;
if(bits[i].getBitType().equals("PLC ")) {
dumpBit(bits[i], i);
}
}
}
private void dumpBit(QCBit bit, int index) {
System.out.println();
System.out.println("Dumping " + bit.getBitType() + " bit at " + index);
System.out.println(" Is a " + bit.getThingType() + ", number is " + bit.getOptA());
System.out.println(" Starts at " + bit.getDataOffset() + " (0x" + Integer.toHexString(bit.getDataOffset()) + ")");
System.out.println(" Runs for " + bit.getLength() + " (0x" + Integer.toHexString(bit.getLength()) + ")");
private void dumpBit(QCBit bit, int index) {
System.out.println();
System.out.println("Dumping " + bit.getBitType() + " bit at " + index);
System.out.println(" Is a " + bit.getThingType() + ", number is " + bit.getOptA());
System.out.println(" Starts at " + bit.getDataOffset() + " (0x" + Integer.toHexString(bit.getDataOffset()) + ")");
System.out.println(" Runs for " + bit.getLength() + " (0x" + Integer.toHexString(bit.getLength()) + ")");
System.out.println(HexDump.dump(bit.getData(), 0, 0));
}
System.out.println(HexDump.dump(bit.getData(), 0, 0));
}
}

View File

@ -50,65 +50,65 @@ public final class PublisherTextExtractor extends POIOLE2TextExtractor {
this(new POIFSFileSystem(is));
}
/**
 * Should a call to getText() return hyperlinks inline
 * with the text?
 * Default is no
 *
 * @param hyperlinksByDefault true to have getText() add the hyperlinks
 */
public void setHyperlinksByDefault(boolean hyperlinksByDefault) {
    this.hyperlinksByDefault = hyperlinksByDefault;
}
public String getText() {
StringBuilder text = new StringBuilder();
public String getText() {
StringBuilder text = new StringBuilder();
// Get the text from the Quill Contents
QCBit[] bits = doc.getQuillContents().getBits();
for (QCBit bit1 : bits) {
if (bit1 != null && bit1 instanceof QCTextBit) {
QCTextBit t = (QCTextBit) bit1;
text.append(t.getText().replace('\r', '\n'));
}
}
// Get the text from the Quill Contents
QCBit[] bits = doc.getQuillContents().getBits();
for (QCBit bit1 : bits) {
if (bit1 != null && bit1 instanceof QCTextBit) {
QCTextBit t = (QCTextBit) bit1;
text.append(t.getText().replace('\r', '\n'));
}
}
// If requested, add in the hyperlinks
// Ideally, we'd do these inline, but the hyperlink
// positions are relative to the text area the
// hyperlink is in, and we have yet to figure out
// how to tie that together.
if(hyperlinksByDefault) {
for (QCBit bit : bits) {
if (bit != null && bit instanceof Type12) {
Type12 hyperlinks = (Type12) bit;
for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) {
text.append("<");
text.append(hyperlinks.getHyperlink(j));
text.append(">\n");
}
}
}
}
// If requested, add in the hyperlinks
// Ideally, we'd do these inline, but the hyperlink
// positions are relative to the text area the
// hyperlink is in, and we have yet to figure out
// how to tie that together.
if(hyperlinksByDefault) {
for (QCBit bit : bits) {
if (bit != null && bit instanceof Type12) {
Type12 hyperlinks = (Type12) bit;
for (int j = 0; j < hyperlinks.getNumberOfHyperlinks(); j++) {
text.append("<");
text.append(hyperlinks.getHyperlink(j));
text.append(">\n");
}
}
}
}
// Get more text
// TODO
// Get more text
// TODO
return text.toString();
}
return text.toString();
}
public static void main(String[] args) throws Exception {
if(args.length == 0) {
System.err.println("Use:");
System.err.println(" PublisherTextExtractor <file.pub>");
}
public static void main(String[] args) throws Exception {
if(args.length == 0) {
System.err.println("Use:");
System.err.println(" PublisherTextExtractor <file.pub>");
}
for (String arg : args) {
try (FileInputStream fis = new FileInputStream(arg)) {
PublisherTextExtractor te = new PublisherTextExtractor(fis);
System.out.println(te.getText());
te.close();
}
}
}
for (String arg : args) {
try (FileInputStream fis = new FileInputStream(arg)) {
PublisherTextExtractor te = new PublisherTextExtractor(fis);
System.out.println(te.getText());
te.close();
}
}
}
}

View File

@ -53,158 +53,158 @@ import org.apache.poi.util.LittleEndian;
* lucene indexers) that would ever want to use this!
*/
public final class QuickButCruddyTextExtractor {
private POIFSFileSystem fs;
private InputStream is;
private byte[] pptContents;
private POIFSFileSystem fs;
private InputStream is;
private byte[] pptContents;
/**
* Really basic text extractor, that will also return lots of crud text.
* Takes a single argument, the file to extract from
*/
public static void main(String[] args) throws IOException
{
if(args.length < 1) {
System.err.println("Useage:");
System.err.println("\tQuickButCruddyTextExtractor <file>");
System.exit(1);
}
/**
* Really basic text extractor, that will also return lots of crud text.
* Takes a single argument, the file to extract from
*/
public static void main(String[] args) throws IOException
{
if(args.length < 1) {
System.err.println("Useage:");
System.err.println("\tQuickButCruddyTextExtractor <file>");
System.exit(1);
}
String file = args[0];
String file = args[0];
QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(file);
System.out.println(ppe.getTextAsString());
ppe.close();
}
QuickButCruddyTextExtractor ppe = new QuickButCruddyTextExtractor(file);
System.out.println(ppe.getTextAsString());
ppe.close();
}
/**
* Creates an extractor from a given file name
* @param fileName
*/
@SuppressWarnings("resource")
public QuickButCruddyTextExtractor(String fileName) throws IOException {
this(new POIFSFileSystem(new File(fileName)));
}
/**
* Creates an extractor from a given input stream
* @param iStream
*/
/**
* Creates an extractor from a given file name
* @param fileName
*/
@SuppressWarnings("resource")
public QuickButCruddyTextExtractor(InputStream iStream) throws IOException {
this(new POIFSFileSystem(iStream));
is = iStream;
}
public QuickButCruddyTextExtractor(String fileName) throws IOException {
this(new POIFSFileSystem(new File(fileName)));
}
/**
* Creates an extractor from a POIFS Filesystem
* @param poifs
*/
public QuickButCruddyTextExtractor(POIFSFileSystem poifs) throws IOException {
fs = poifs;
/**
* Creates an extractor from a given input stream
* @param iStream
*/
@SuppressWarnings("resource")
public QuickButCruddyTextExtractor(InputStream iStream) throws IOException {
this(new POIFSFileSystem(iStream));
is = iStream;
}
// Find the PowerPoint bit, and get out the bytes
InputStream pptIs = fs.createDocumentInputStream(HSLFSlideShow.POWERPOINT_DOCUMENT);
pptContents = IOUtils.toByteArray(pptIs);
pptIs.close();
}
/**
* Creates an extractor from a POIFS Filesystem
* @param poifs
*/
public QuickButCruddyTextExtractor(POIFSFileSystem poifs) throws IOException {
fs = poifs;
// Find the PowerPoint bit, and get out the bytes
InputStream pptIs = fs.createDocumentInputStream(HSLFSlideShow.POWERPOINT_DOCUMENT);
pptContents = IOUtils.toByteArray(pptIs);
pptIs.close();
}
/**
* Shuts down the underlying streams
*/
public void close() throws IOException {
if(is != null) { is.close(); }
fs = null;
}
/**
* Shuts down the underlying streams
*/
public void close() throws IOException {
if(is != null) { is.close(); }
fs = null;
}
/**
* Fetches the ALL the text of the powerpoint file, as a single string
*/
public String getTextAsString() {
StringBuilder ret = new StringBuilder();
List<String> textV = getTextAsVector();
for(String text : textV) {
ret.append(text);
if(! text.endsWith("\n")) {
ret.append('\n');
}
}
return ret.toString();
}
/**
* Fetches the ALL the text of the powerpoint file, as a single string
*/
public String getTextAsString() {
StringBuilder ret = new StringBuilder();
List<String> textV = getTextAsVector();
for(String text : textV) {
ret.append(text);
if(! text.endsWith("\n")) {
ret.append('\n');
}
}
return ret.toString();
}
/**
* Fetches the ALL the text of the powerpoint file, in a List of
* strings, one per text record
*/
public List<String> getTextAsVector() {
List<String> textV = new ArrayList<>();
/**
* Fetches the ALL the text of the powerpoint file, in a List of
* strings, one per text record
*/
public List<String> getTextAsVector() {
List<String> textV = new ArrayList<>();
// Set to the start of the file
int walkPos = 0;
// Set to the start of the file
int walkPos = 0;
// Start walking the file, looking for the records
while(walkPos != -1) {
// Start walking the file, looking for the records
while(walkPos != -1) {
walkPos = findTextRecords(walkPos,textV);
}
}
// Return what we find
return textV;
}
// Return what we find
return textV;
}
/**
* For the given position, look if the record is a text record, and wind
* on after.
* If it is a text record, grabs out the text. Whatever happens, returns
* the position of the next record, or -1 if no more.
*/
public int findTextRecords(int startPos, List<String> textV) {
// Grab the length, and the first option byte
// Note that the length doesn't include the 8 byte atom header
int len = (int)LittleEndian.getUInt(pptContents,startPos+4);
byte opt = pptContents[startPos];
/**
* For the given position, look if the record is a text record, and wind
* on after.
* If it is a text record, grabs out the text. Whatever happens, returns
* the position of the next record, or -1 if no more.
*/
public int findTextRecords(int startPos, List<String> textV) {
// Grab the length, and the first option byte
// Note that the length doesn't include the 8 byte atom header
int len = (int)LittleEndian.getUInt(pptContents,startPos+4);
byte opt = pptContents[startPos];
// If it's a container, step into it and return
// (If it's a container, option byte 1 BINARY_AND 0x0f will be 0x0f)
int container = opt & 0x0f;
if(container == 0x0f) {
return (startPos+8);
}
// If it's a container, step into it and return
// (If it's a container, option byte 1 BINARY_AND 0x0f will be 0x0f)
int container = opt & 0x0f;
if(container == 0x0f) {
return (startPos+8);
}
// Otherwise, check the type to see if it's text
int type = LittleEndian.getUShort(pptContents,startPos+2);
// Otherwise, check the type to see if it's text
int type = LittleEndian.getUShort(pptContents,startPos+2);
// TextBytesAtom
if(type == RecordTypes.TextBytesAtom.typeID) {
TextBytesAtom tba = (TextBytesAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
String text = HSLFTextParagraph.toExternalString(tba.getText(), -1);
textV.add(text);
}
// TextCharsAtom
if(type == RecordTypes.TextCharsAtom.typeID) {
TextCharsAtom tca = (TextCharsAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
// TextBytesAtom
if(type == RecordTypes.TextBytesAtom.typeID) {
TextBytesAtom tba = (TextBytesAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
String text = HSLFTextParagraph.toExternalString(tba.getText(), -1);
textV.add(text);
}
// TextCharsAtom
if(type == RecordTypes.TextCharsAtom.typeID) {
TextCharsAtom tca = (TextCharsAtom)Record.createRecordForType(type, pptContents, startPos, len+8);
String text = HSLFTextParagraph.toExternalString(tca.getText(), -1);
textV.add(text);
}
}
// CString (doesn't go via a TextRun)
if(type == RecordTypes.CString.typeID) {
CString cs = (CString)Record.createRecordForType(type, pptContents, startPos, len+8);
String text = cs.getText();
// CString (doesn't go via a TextRun)
if(type == RecordTypes.CString.typeID) {
CString cs = (CString)Record.createRecordForType(type, pptContents, startPos, len+8);
String text = cs.getText();
// Ignore the ones we know to be rubbish
if(text.equals("___PPT10")) {
} else if(text.equals("Default Design")) {
} else {
textV.add(text);
}
}
// Ignore the ones we know to be rubbish
if(text.equals("___PPT10")) {
} else if(text.equals("Default Design")) {
} else {
textV.add(text);
}
}
// Wind on by the atom length, and check we're not at the end
int newPos = (startPos + 8 + len);
if(newPos > (pptContents.length - 8)) {
newPos = -1;
}
return newPos;
}
// Wind on by the atom length, and check we're not at the end
int newPos = (startPos + 8 + len);
if(newPos > (pptContents.length - 8)) {
newPos = -1;
}
return newPos;
}
}