Lots more HDGF support for chunks, and add support for basic text extraction

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@551258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2007-06-27 18:34:17 +00:00
parent 296705e0b6
commit 97fb171369
10 changed files with 224 additions and 4 deletions

View File

@ -20,6 +20,7 @@ import java.util.ArrayList;
import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition; import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition;
import org.apache.poi.util.LittleEndian; import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
/** /**
* Base of all chunks, which hold data, flags etc * Base of all chunks, which hold data, flags etc
@ -167,6 +168,24 @@ public class Chunk {
LittleEndian.getDouble(contents, offset) LittleEndian.getDouble(contents, offset)
); );
break; break;
case 12:
// A Little Endian String
// Starts 8 bytes into the data segment
// Ends at end of data, or 00 00
int startsAt = 8;
int endsAt = startsAt;
for(int j=startsAt; j<contents.length-1 && endsAt == startsAt; j++) {
if(contents[j] == 0 && contents[j+1] == 0) {
endsAt = j;
}
}
if(endsAt == startsAt) {
endsAt = contents.length;
}
int strLen = (endsAt-startsAt) / 2;
command.value = StringUtil.getFromUnicodeLE(contents, startsAt, strLen);
break;
case 25: case 25:
command.value = new Short( command.value = new Short(
LittleEndian.getShort(contents, offset) LittleEndian.getShort(contents, offset)

View File

@ -24,6 +24,10 @@ public class ChunkHeaderV11 extends ChunkHeaderV6 {
* Does the chunk have a separator? * Does the chunk have a separator?
*/ */
public boolean hasSeparator() { public boolean hasSeparator() {
// For some reason, there are two types that don't have a
// separator despite the flags that indicate they do
if(type == 0x1f || type == 0xc9) { return false; }
// If there's a trailer, there's a separator // If there's a trailer, there's a separator
if(hasTrailer()) { return true; } if(hasTrailer()) { return true; }

View File

@ -27,4 +27,8 @@ public class ChunkSeparator {
separatorData = new byte[4]; separatorData = new byte[4];
System.arraycopy(data, offset, separatorData, 0, 4); System.arraycopy(data, offset, separatorData, 0, 4);
} }
/**
 * Describes this separator by the number of bytes of
 *  separator data it holds.
 */
public String toString() {
    StringBuffer description = new StringBuffer("<ChunkSeparator of length ");
    description.append(separatorData.length);
    description.append(">");
    return description.toString();
}
} }

View File

@ -26,4 +26,8 @@ public class ChunkTrailer {
trailerData = new byte[8]; trailerData = new byte[8];
System.arraycopy(data, offset, trailerData, 0, 8); System.arraycopy(data, offset, trailerData, 0, 8);
} }
/**
 * Describes this trailer by the number of bytes of
 *  trailer data it holds.
 */
public String toString() {
    StringBuffer description = new StringBuffer("<ChunkTrailer of length ");
    description.append(trailerData.length);
    description.append(">");
    return description.toString();
}
} }

View File

@ -70,6 +70,11 @@ public class VSDDumper {
" - " + Integer.toHexString(ptr.getFormat())); " - " + Integer.toHexString(ptr.getFormat()));
System.out.println(ind + " Length is\t" + ptr.getLength() + System.out.println(ind + " Length is\t" + ptr.getLength() +
" - " + Integer.toHexString(ptr.getLength())); " - " + Integer.toHexString(ptr.getLength()));
if(ptr.destinationCompressed()) {
int decompLen = stream._getContentsLength();
System.out.println(ind + " DC.Length is\t" + decompLen +
" - " + Integer.toHexString(decompLen));
}
System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed()); System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed());
System.out.println(ind + " Stream is\t" + stream.getClass().getName()); System.out.println(ind + " Stream is\t" + stream.getClass().getName());
@ -100,6 +105,9 @@ public class VSDDumper {
for(int i=0; i<cs.getChunks().length; i++) { for(int i=0; i<cs.getChunks().length; i++) {
Chunk chunk = cs.getChunks()[i]; Chunk chunk = cs.getChunks()[i];
System.out.println(ind2 + "" + chunk.getName()); System.out.println(ind2 + "" + chunk.getName());
System.out.println(ind2 + " Length is " + chunk._getContents().length + " (" + Integer.toHexString(chunk._getContents().length) + ")");
System.out.println(ind2 + " OD Size is " + chunk.getOnDiskSize() + " (" + Integer.toHexString(chunk.getOnDiskSize()) + ")");
System.out.println(ind2 + " T / S is " + chunk.getTrailer() + " / " + chunk.getSeparator());
System.out.println(ind2 + " Holds " + chunk.getCommands().length + " commands"); System.out.println(ind2 + " Holds " + chunk.getCommands().length + " commands");
for(int j=0; j<chunk.getCommands().length; j++) { for(int j=0; j<chunk.getCommands().length; j++) {
Command command = chunk.getCommands()[j]; Command command = chunk.getCommands()[j];

View File

@ -0,0 +1,113 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hdgf.extractor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.poi.hdgf.HDGFDiagram;
import org.apache.poi.hdgf.chunks.Chunk.Command;
import org.apache.poi.hdgf.streams.ChunkStream;
import org.apache.poi.hdgf.streams.PointerContainingStream;
import org.apache.poi.hdgf.streams.Stream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Class to find all the text in a Visio file, and return it.
* Can opperate on the command line (outputs to stdout), or
* can return the text for you (eg for use with Lucene).
*/
public class VisioTextExtractor {
private HDGFDiagram hdgf;
private POIFSFileSystem fs;
public VisioTextExtractor(HDGFDiagram hdgf) {
this.hdgf = hdgf;
}
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
this(new HDGFDiagram(fs));
this.fs = fs;
}
public VisioTextExtractor(InputStream inp) throws IOException {
this(new POIFSFileSystem(inp));
}
/**
* Locates all the text entries in the file, and returns their
* contents.
*/
public String[] getAllText() {
ArrayList text = new ArrayList();
for(int i=0; i<hdgf.getTopLevelStreams().length; i++) {
findText(hdgf.getTopLevelStreams()[i], text);
}
System.err.println("Found " + text.size() + " text string");
return (String[])text.toArray( new String[text.size()] );
}
private void findText(Stream stream, ArrayList text) {
if(stream instanceof PointerContainingStream) {
PointerContainingStream ps = (PointerContainingStream)stream;
for(int i=0; i<ps.getPointedToStreams().length; i++) {
findText(ps.getPointedToStreams()[i], text);
}
}
if(stream instanceof ChunkStream) {
ChunkStream cs = (ChunkStream)stream;
for(int i=0; i<cs.getChunks().length; i++) {
if(cs.getChunks()[i] != null &&
cs.getChunks()[i].getName() != null &&
cs.getChunks()[i].getName().equals("Text")) {
// First command
Command cmd = cs.getChunks()[i].getCommands()[0];
if(cmd != null && cmd.getValue() != null) {
text.add( cmd.getValue().toString() );
}
}
}
}
}
/**
* Returns the textual contents of the file.
*/
public String getText() {
StringBuffer text = new StringBuffer();
String[] allText = getAllText();
for(int i=0; i<allText.length; i++) {
text.append(allText[i]);
if(!allText[i].endsWith("\r") &&
!allText[i].endsWith("\n")) {
text.append("\n");
}
}
return text.toString();
}
public static void main(String[] args) throws Exception {
if(args.length == 0) {
System.err.println("Use:");
System.err.println(" VisioTextExtractor <file.vsd>");
System.exit(1);
}
VisioTextExtractor extractor =
new VisioTextExtractor(new FileInputStream(args[0]));
System.out.println(extractor.getText());
}
}

View File

@ -43,6 +43,11 @@ public class ChunkStream extends Stream {
public void findChunks() { public void findChunks() {
ArrayList chunksA = new ArrayList(); ArrayList chunksA = new ArrayList();
if(getPointer().getOffset() == 0x64b3) {
int i = 0;
i++;
}
int pos = 0; int pos = 0;
byte[] contents = getStore().getContents(); byte[] contents = getStore().getContents();
while(pos < contents.length) { while(pos < contents.length) {

View File

@ -83,7 +83,7 @@ public abstract class Stream {
return new ChunkStream(pointer, store, chunkFactory); return new ChunkStream(pointer, store, chunkFactory);
} }
else if(pointer.destinationHasStrings()) { else if(pointer.destinationHasStrings()) {
return new StringsStream(pointer, store); return new StringsStream(pointer, store, chunkFactory);
} }
// Give up and return a generic one // Give up and return a generic one

View File

@ -16,13 +16,16 @@ limitations under the License.
==================================================================== */ ==================================================================== */
package org.apache.poi.hdgf.streams; package org.apache.poi.hdgf.streams;
import org.apache.poi.hdgf.chunks.ChunkFactory;
import org.apache.poi.hdgf.pointers.Pointer; import org.apache.poi.hdgf.pointers.Pointer;
/** /**
* A Stream which holds Strings * A Stream which holds Strings. This is just another kind
* of ChunkStream, it seems
*/ */
public class StringsStream extends Stream { public class StringsStream extends Stream {
protected StringsStream(Pointer pointer, StreamStore store) { protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) {
super(pointer, store); super(pointer, store);
// super(pointer, store, chunkFactory);
} }
} }

View File

@ -18,6 +18,7 @@ package org.apache.poi.hdgf.streams;
import java.io.FileInputStream; import java.io.FileInputStream;
import org.apache.poi.hdgf.chunks.Chunk;
import org.apache.poi.hdgf.chunks.ChunkFactory; import org.apache.poi.hdgf.chunks.ChunkFactory;
import org.apache.poi.hdgf.pointers.Pointer; import org.apache.poi.hdgf.pointers.Pointer;
import org.apache.poi.hdgf.pointers.PointerFactory; import org.apache.poi.hdgf.pointers.PointerFactory;
@ -202,4 +203,63 @@ public class TestStreamComplex extends StreamTest {
assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream); assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream); assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
} }
/**
 * Walks from the trailer, through its child at 0x7194, to the
 *  chunk stream at 0x64b3, and checks that the "Text" chunks
 *  in there hold the expected strings.
 */
public void testChunkWithText() throws Exception {
    // The chunk stream we want hangs off the stream at 0x7194,
    //  which is itself one of the later children of the trailer
    Pointer trailerPointer = ptrFactory.createPointer(contents, trailerPointerAt);
    TrailerStream trailer = (TrailerStream)
        Stream.createStream(trailerPointer, contents, chunkFactory, ptrFactory);
    trailer.findChildren(contents);

    assertNotNull(trailer.getChildPointers());
    assertNotNull(trailer.getPointedToStreams());
    assertEquals(20, trailer.getChildPointers().length);
    assertEquals(20, trailer.getPointedToStreams().length);

    // Child 13 of the trailer is the stream at 0x7194
    assertEquals(0x7194, trailer.getChildPointers()[13].getOffset());
    assertEquals(0x7194, trailer.getPointedToStreams()[13].getPointer().getOffset());
    PointerContainingStream parent = (PointerContainingStream)
        trailer.getPointedToStreams()[13];

    // Its first child is the chunk stream at 0x64b3
    assertEquals(0x64b3, parent.getChildPointers()[0].getOffset());
    assertEquals(0x64b3, parent.getPointedToStreams()[0].getPointer().getOffset());
    ChunkStream chunkStream = (ChunkStream)parent.getPointedToStreams()[0];

    // Un-compressed, that stream is 0x26bc bytes long
    assertEquals(0x26bc, chunkStream.getStore().getContents().length);
    // It holds a large number of chunks
    assertEquals(131, chunkStream.getChunks().length);

    // At least one of the chunks must be a Text one
    boolean foundText = false;
    for(int idx=0; idx<chunkStream.getChunks().length; idx++) {
        foundText |= chunkStream.getChunks()[idx].getName().equals("Text");
    }
    assertTrue(foundText);

    // The chunk at index 72 is a Text chunk
    assertEquals("Text", chunkStream.getChunks()[72].getName());
    Chunk textChunk = chunkStream.getChunks()[72];
    assertEquals("Text", textChunk.getName());

    // Its single command holds the text
    assertEquals(1, textChunk.getCommands().length);
    assertEquals("Test View\n", textChunk.getCommands()[0].getValue());

    // There is another Text chunk close to the end, at index 128
    assertEquals("Text", chunkStream.getChunks()[128].getName());
    textChunk = chunkStream.getChunks()[128];
    assertEquals("Text", textChunk.getName());

    assertEquals(1, textChunk.getCommands().length);
    assertEquals("Some random text, on a page\n", textChunk.getCommands()[0].getValue());
}
} }