mirror of https://github.com/apache/poi.git
Lots more HDGF support for chunks, and add support for basic text extraction
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@551258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
296705e0b6
commit
97fb171369
|
@ -20,6 +20,7 @@ import java.util.ArrayList;
|
||||||
|
|
||||||
import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition;
|
import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition;
|
||||||
import org.apache.poi.util.LittleEndian;
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
import org.apache.poi.util.StringUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Base of all chunks, which hold data, flags etc
|
* Base of all chunks, which hold data, flags etc
|
||||||
|
@ -167,6 +168,24 @@ public class Chunk {
|
||||||
LittleEndian.getDouble(contents, offset)
|
LittleEndian.getDouble(contents, offset)
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
|
case 12:
|
||||||
|
// A Little Endian String
|
||||||
|
// Starts 8 bytes into the data segment
|
||||||
|
// Ends at end of data, or 00 00
|
||||||
|
int startsAt = 8;
|
||||||
|
int endsAt = startsAt;
|
||||||
|
for(int j=startsAt; j<contents.length-1 && endsAt == startsAt; j++) {
|
||||||
|
if(contents[j] == 0 && contents[j+1] == 0) {
|
||||||
|
endsAt = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(endsAt == startsAt) {
|
||||||
|
endsAt = contents.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
int strLen = (endsAt-startsAt) / 2;
|
||||||
|
command.value = StringUtil.getFromUnicodeLE(contents, startsAt, strLen);
|
||||||
|
break;
|
||||||
case 25:
|
case 25:
|
||||||
command.value = new Short(
|
command.value = new Short(
|
||||||
LittleEndian.getShort(contents, offset)
|
LittleEndian.getShort(contents, offset)
|
||||||
|
|
|
@ -24,6 +24,10 @@ public class ChunkHeaderV11 extends ChunkHeaderV6 {
|
||||||
* Does the chunk have a separator?
|
* Does the chunk have a separator?
|
||||||
*/
|
*/
|
||||||
public boolean hasSeparator() {
|
public boolean hasSeparator() {
|
||||||
|
// For some reason, there are two types that don't have a
|
||||||
|
// separator despite the flags that indicate they do
|
||||||
|
if(type == 0x1f || type == 0xc9) { return false; }
|
||||||
|
|
||||||
// If there's a trailer, there's a separator
|
// If there's a trailer, there's a separator
|
||||||
if(hasTrailer()) { return true; }
|
if(hasTrailer()) { return true; }
|
||||||
|
|
||||||
|
|
|
@ -27,4 +27,8 @@ public class ChunkSeparator {
|
||||||
separatorData = new byte[4];
|
separatorData = new byte[4];
|
||||||
System.arraycopy(data, offset, separatorData, 0, 4);
|
System.arraycopy(data, offset, separatorData, 0, 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "<ChunkSeparator of length " + separatorData.length + ">";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,4 +26,8 @@ public class ChunkTrailer {
|
||||||
trailerData = new byte[8];
|
trailerData = new byte[8];
|
||||||
System.arraycopy(data, offset, trailerData, 0, 8);
|
System.arraycopy(data, offset, trailerData, 0, 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "<ChunkTrailer of length " + trailerData.length + ">";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -70,6 +70,11 @@ public class VSDDumper {
|
||||||
" - " + Integer.toHexString(ptr.getFormat()));
|
" - " + Integer.toHexString(ptr.getFormat()));
|
||||||
System.out.println(ind + " Length is\t" + ptr.getLength() +
|
System.out.println(ind + " Length is\t" + ptr.getLength() +
|
||||||
" - " + Integer.toHexString(ptr.getLength()));
|
" - " + Integer.toHexString(ptr.getLength()));
|
||||||
|
if(ptr.destinationCompressed()) {
|
||||||
|
int decompLen = stream._getContentsLength();
|
||||||
|
System.out.println(ind + " DC.Length is\t" + decompLen +
|
||||||
|
" - " + Integer.toHexString(decompLen));
|
||||||
|
}
|
||||||
System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed());
|
System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed());
|
||||||
System.out.println(ind + " Stream is\t" + stream.getClass().getName());
|
System.out.println(ind + " Stream is\t" + stream.getClass().getName());
|
||||||
|
|
||||||
|
@ -100,6 +105,9 @@ public class VSDDumper {
|
||||||
for(int i=0; i<cs.getChunks().length; i++) {
|
for(int i=0; i<cs.getChunks().length; i++) {
|
||||||
Chunk chunk = cs.getChunks()[i];
|
Chunk chunk = cs.getChunks()[i];
|
||||||
System.out.println(ind2 + "" + chunk.getName());
|
System.out.println(ind2 + "" + chunk.getName());
|
||||||
|
System.out.println(ind2 + " Length is " + chunk._getContents().length + " (" + Integer.toHexString(chunk._getContents().length) + ")");
|
||||||
|
System.out.println(ind2 + " OD Size is " + chunk.getOnDiskSize() + " (" + Integer.toHexString(chunk.getOnDiskSize()) + ")");
|
||||||
|
System.out.println(ind2 + " T / S is " + chunk.getTrailer() + " / " + chunk.getSeparator());
|
||||||
System.out.println(ind2 + " Holds " + chunk.getCommands().length + " commands");
|
System.out.println(ind2 + " Holds " + chunk.getCommands().length + " commands");
|
||||||
for(int j=0; j<chunk.getCommands().length; j++) {
|
for(int j=0; j<chunk.getCommands().length; j++) {
|
||||||
Command command = chunk.getCommands()[j];
|
Command command = chunk.getCommands()[j];
|
||||||
|
|
|
@ -0,0 +1,113 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hdgf.extractor;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
import org.apache.poi.hdgf.HDGFDiagram;
|
||||||
|
import org.apache.poi.hdgf.chunks.Chunk.Command;
|
||||||
|
import org.apache.poi.hdgf.streams.ChunkStream;
|
||||||
|
import org.apache.poi.hdgf.streams.PointerContainingStream;
|
||||||
|
import org.apache.poi.hdgf.streams.Stream;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class to find all the text in a Visio file, and return it.
|
||||||
|
* Can operate on the command line (outputs to stdout), or
|
||||||
|
* can return the text for you (eg for use with Lucene).
|
||||||
|
*/
|
||||||
|
public class VisioTextExtractor {
|
||||||
|
private HDGFDiagram hdgf;
|
||||||
|
private POIFSFileSystem fs;
|
||||||
|
|
||||||
|
public VisioTextExtractor(HDGFDiagram hdgf) {
|
||||||
|
this.hdgf = hdgf;
|
||||||
|
}
|
||||||
|
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
|
||||||
|
this(new HDGFDiagram(fs));
|
||||||
|
this.fs = fs;
|
||||||
|
}
|
||||||
|
public VisioTextExtractor(InputStream inp) throws IOException {
|
||||||
|
this(new POIFSFileSystem(inp));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Locates all the text entries in the file, and returns their
|
||||||
|
* contents.
|
||||||
|
*/
|
||||||
|
public String[] getAllText() {
|
||||||
|
ArrayList text = new ArrayList();
|
||||||
|
for(int i=0; i<hdgf.getTopLevelStreams().length; i++) {
|
||||||
|
findText(hdgf.getTopLevelStreams()[i], text);
|
||||||
|
}
|
||||||
|
System.err.println("Found " + text.size() + " text string");
|
||||||
|
return (String[])text.toArray( new String[text.size()] );
|
||||||
|
}
|
||||||
|
private void findText(Stream stream, ArrayList text) {
|
||||||
|
if(stream instanceof PointerContainingStream) {
|
||||||
|
PointerContainingStream ps = (PointerContainingStream)stream;
|
||||||
|
for(int i=0; i<ps.getPointedToStreams().length; i++) {
|
||||||
|
findText(ps.getPointedToStreams()[i], text);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(stream instanceof ChunkStream) {
|
||||||
|
ChunkStream cs = (ChunkStream)stream;
|
||||||
|
for(int i=0; i<cs.getChunks().length; i++) {
|
||||||
|
if(cs.getChunks()[i] != null &&
|
||||||
|
cs.getChunks()[i].getName() != null &&
|
||||||
|
cs.getChunks()[i].getName().equals("Text")) {
|
||||||
|
// First command
|
||||||
|
Command cmd = cs.getChunks()[i].getCommands()[0];
|
||||||
|
if(cmd != null && cmd.getValue() != null) {
|
||||||
|
text.add( cmd.getValue().toString() );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the textual contents of the file.
|
||||||
|
*/
|
||||||
|
public String getText() {
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
String[] allText = getAllText();
|
||||||
|
for(int i=0; i<allText.length; i++) {
|
||||||
|
text.append(allText[i]);
|
||||||
|
if(!allText[i].endsWith("\r") &&
|
||||||
|
!allText[i].endsWith("\n")) {
|
||||||
|
text.append("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
if(args.length == 0) {
|
||||||
|
System.err.println("Use:");
|
||||||
|
System.err.println(" VisioTextExtractor <file.vsd>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
VisioTextExtractor extractor =
|
||||||
|
new VisioTextExtractor(new FileInputStream(args[0]));
|
||||||
|
System.out.println(extractor.getText());
|
||||||
|
}
|
||||||
|
}
|
|
@ -43,6 +43,11 @@ public class ChunkStream extends Stream {
|
||||||
public void findChunks() {
|
public void findChunks() {
|
||||||
ArrayList chunksA = new ArrayList();
|
ArrayList chunksA = new ArrayList();
|
||||||
|
|
||||||
|
if(getPointer().getOffset() == 0x64b3) {
|
||||||
|
int i = 0;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
byte[] contents = getStore().getContents();
|
byte[] contents = getStore().getContents();
|
||||||
while(pos < contents.length) {
|
while(pos < contents.length) {
|
||||||
|
|
|
@ -83,7 +83,7 @@ public abstract class Stream {
|
||||||
return new ChunkStream(pointer, store, chunkFactory);
|
return new ChunkStream(pointer, store, chunkFactory);
|
||||||
}
|
}
|
||||||
else if(pointer.destinationHasStrings()) {
|
else if(pointer.destinationHasStrings()) {
|
||||||
return new StringsStream(pointer, store);
|
return new StringsStream(pointer, store, chunkFactory);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Give up and return a generic one
|
// Give up and return a generic one
|
||||||
|
|
|
@ -16,13 +16,16 @@ limitations under the License.
|
||||||
==================================================================== */
|
==================================================================== */
|
||||||
package org.apache.poi.hdgf.streams;
|
package org.apache.poi.hdgf.streams;
|
||||||
|
|
||||||
|
import org.apache.poi.hdgf.chunks.ChunkFactory;
|
||||||
import org.apache.poi.hdgf.pointers.Pointer;
|
import org.apache.poi.hdgf.pointers.Pointer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A Stream which holds Strings
|
* A Stream which holds Strings. This is just another kind
|
||||||
|
* of ChunkStream, it seems
|
||||||
*/
|
*/
|
||||||
public class StringsStream extends Stream {
|
public class StringsStream extends Stream {
|
||||||
protected StringsStream(Pointer pointer, StreamStore store) {
|
protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) {
|
||||||
super(pointer, store);
|
super(pointer, store);
|
||||||
|
// super(pointer, store, chunkFactory);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.poi.hdgf.streams;
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
|
|
||||||
|
import org.apache.poi.hdgf.chunks.Chunk;
|
||||||
import org.apache.poi.hdgf.chunks.ChunkFactory;
|
import org.apache.poi.hdgf.chunks.ChunkFactory;
|
||||||
import org.apache.poi.hdgf.pointers.Pointer;
|
import org.apache.poi.hdgf.pointers.Pointer;
|
||||||
import org.apache.poi.hdgf.pointers.PointerFactory;
|
import org.apache.poi.hdgf.pointers.PointerFactory;
|
||||||
|
@ -202,4 +203,63 @@ public class TestStreamComplex extends StreamTest {
|
||||||
assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
|
assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
|
||||||
assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
|
assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testChunkWithText() throws Exception {
|
||||||
|
// Parent ChunkStream is at 0x7194
|
||||||
|
// This is one of the last children of the trailer
|
||||||
|
Pointer trailerPtr = ptrFactory.createPointer(contents, trailerPointerAt);
|
||||||
|
TrailerStream ts = (TrailerStream)
|
||||||
|
Stream.createStream(trailerPtr, contents, chunkFactory, ptrFactory);
|
||||||
|
|
||||||
|
ts.findChildren(contents);
|
||||||
|
|
||||||
|
assertNotNull(ts.getChildPointers());
|
||||||
|
assertNotNull(ts.getPointedToStreams());
|
||||||
|
assertEquals(20, ts.getChildPointers().length);
|
||||||
|
assertEquals(20, ts.getPointedToStreams().length);
|
||||||
|
|
||||||
|
assertEquals(0x7194, ts.getChildPointers()[13].getOffset());
|
||||||
|
assertEquals(0x7194, ts.getPointedToStreams()[13].getPointer().getOffset());
|
||||||
|
|
||||||
|
PointerContainingStream ps7194 = (PointerContainingStream)
|
||||||
|
ts.getPointedToStreams()[13];
|
||||||
|
|
||||||
|
// First child is at 0x64b3
|
||||||
|
assertEquals(0x64b3, ps7194.getChildPointers()[0].getOffset());
|
||||||
|
assertEquals(0x64b3, ps7194.getPointedToStreams()[0].getPointer().getOffset());
|
||||||
|
|
||||||
|
ChunkStream cs = (ChunkStream)ps7194.getPointedToStreams()[0];
|
||||||
|
|
||||||
|
// Should be 26bc bytes un-compressed
|
||||||
|
assertEquals(0x26bc, cs.getStore().getContents().length);
|
||||||
|
// And should have lots of children
|
||||||
|
assertEquals(131, cs.getChunks().length);
|
||||||
|
|
||||||
|
// One of which is Text
|
||||||
|
boolean hasText = false;
|
||||||
|
for(int i=0; i<cs.getChunks().length; i++) {
|
||||||
|
if(cs.getChunks()[i].getName().equals("Text")) {
|
||||||
|
hasText = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertTrue(hasText);
|
||||||
|
// Which is the chunk at index 72
|
||||||
|
assertEquals("Text", cs.getChunks()[72].getName());
|
||||||
|
|
||||||
|
Chunk text = cs.getChunks()[72];
|
||||||
|
assertEquals("Text", text.getName());
|
||||||
|
|
||||||
|
// Which contains our text
|
||||||
|
assertEquals(1, text.getCommands().length);
|
||||||
|
assertEquals("Test View\n", text.getCommands()[0].getValue());
|
||||||
|
|
||||||
|
|
||||||
|
// Almost at the end is some more text
|
||||||
|
assertEquals("Text", cs.getChunks()[128].getName());
|
||||||
|
text = cs.getChunks()[128];
|
||||||
|
assertEquals("Text", text.getName());
|
||||||
|
|
||||||
|
assertEquals(1, text.getCommands().length);
|
||||||
|
assertEquals("Some random text, on a page\n", text.getCommands()[0].getValue());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue