mirror of https://github.com/apache/poi.git
Lots more HDGF support for chunks, and add support for basic text extraction
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@551258 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
296705e0b6
commit
97fb171369
|
@ -20,6 +20,7 @@ import java.util.ArrayList;
|
|||
|
||||
import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
/**
|
||||
* Base of all chunks, which hold data, flags etc
|
||||
|
@ -167,6 +168,24 @@ public class Chunk {
|
|||
LittleEndian.getDouble(contents, offset)
|
||||
);
|
||||
break;
|
||||
case 12:
|
||||
// A Little Endian String
|
||||
// Starts 8 bytes into the data segment
|
||||
// Ends at end of data, or 00 00
|
||||
int startsAt = 8;
|
||||
int endsAt = startsAt;
|
||||
for(int j=startsAt; j<contents.length-1 && endsAt == startsAt; j++) {
|
||||
if(contents[j] == 0 && contents[j+1] == 0) {
|
||||
endsAt = j;
|
||||
}
|
||||
}
|
||||
if(endsAt == startsAt) {
|
||||
endsAt = contents.length;
|
||||
}
|
||||
|
||||
int strLen = (endsAt-startsAt) / 2;
|
||||
command.value = StringUtil.getFromUnicodeLE(contents, startsAt, strLen);
|
||||
break;
|
||||
case 25:
|
||||
command.value = new Short(
|
||||
LittleEndian.getShort(contents, offset)
|
||||
|
|
|
@ -24,6 +24,10 @@ public class ChunkHeaderV11 extends ChunkHeaderV6 {
|
|||
* Does the chunk have a separator?
|
||||
*/
|
||||
public boolean hasSeparator() {
|
||||
// For some reason, there are two types that don't have a
|
||||
// separator despite the flags that indicate they do
|
||||
if(type == 0x1f || type == 0xc9) { return false; }
|
||||
|
||||
// If there's a trailer, there's a separator
|
||||
if(hasTrailer()) { return true; }
|
||||
|
||||
|
|
|
@ -27,4 +27,8 @@ public class ChunkSeparator {
|
|||
separatorData = new byte[4];
|
||||
System.arraycopy(data, offset, separatorData, 0, 4);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "<ChunkSeparator of length " + separatorData.length + ">";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,4 +26,8 @@ public class ChunkTrailer {
|
|||
trailerData = new byte[8];
|
||||
System.arraycopy(data, offset, trailerData, 0, 8);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "<ChunkTrailer of length " + trailerData.length + ">";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -70,6 +70,11 @@ public class VSDDumper {
|
|||
" - " + Integer.toHexString(ptr.getFormat()));
|
||||
System.out.println(ind + " Length is\t" + ptr.getLength() +
|
||||
" - " + Integer.toHexString(ptr.getLength()));
|
||||
if(ptr.destinationCompressed()) {
|
||||
int decompLen = stream._getContentsLength();
|
||||
System.out.println(ind + " DC.Length is\t" + decompLen +
|
||||
" - " + Integer.toHexString(decompLen));
|
||||
}
|
||||
System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed());
|
||||
System.out.println(ind + " Stream is\t" + stream.getClass().getName());
|
||||
|
||||
|
@ -100,6 +105,9 @@ public class VSDDumper {
|
|||
for(int i=0; i<cs.getChunks().length; i++) {
|
||||
Chunk chunk = cs.getChunks()[i];
|
||||
System.out.println(ind2 + "" + chunk.getName());
|
||||
System.out.println(ind2 + " Length is " + chunk._getContents().length + " (" + Integer.toHexString(chunk._getContents().length) + ")");
|
||||
System.out.println(ind2 + " OD Size is " + chunk.getOnDiskSize() + " (" + Integer.toHexString(chunk.getOnDiskSize()) + ")");
|
||||
System.out.println(ind2 + " T / S is " + chunk.getTrailer() + " / " + chunk.getSeparator());
|
||||
System.out.println(ind2 + " Holds " + chunk.getCommands().length + " commands");
|
||||
for(int j=0; j<chunk.getCommands().length; j++) {
|
||||
Command command = chunk.getCommands()[j];
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hdgf.extractor;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import org.apache.poi.hdgf.HDGFDiagram;
|
||||
import org.apache.poi.hdgf.chunks.Chunk.Command;
|
||||
import org.apache.poi.hdgf.streams.ChunkStream;
|
||||
import org.apache.poi.hdgf.streams.PointerContainingStream;
|
||||
import org.apache.poi.hdgf.streams.Stream;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Class to find all the text in a Visio file, and return it.
|
||||
* Can operate on the command line (outputs to stdout), or
|
||||
* can return the text for you (eg for use with Lucene).
|
||||
*/
|
||||
public class VisioTextExtractor {
|
||||
private HDGFDiagram hdgf;
|
||||
private POIFSFileSystem fs;
|
||||
|
||||
public VisioTextExtractor(HDGFDiagram hdgf) {
|
||||
this.hdgf = hdgf;
|
||||
}
|
||||
public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
|
||||
this(new HDGFDiagram(fs));
|
||||
this.fs = fs;
|
||||
}
|
||||
public VisioTextExtractor(InputStream inp) throws IOException {
|
||||
this(new POIFSFileSystem(inp));
|
||||
}
|
||||
|
||||
/**
|
||||
* Locates all the text entries in the file, and returns their
|
||||
* contents.
|
||||
*/
|
||||
public String[] getAllText() {
|
||||
ArrayList text = new ArrayList();
|
||||
for(int i=0; i<hdgf.getTopLevelStreams().length; i++) {
|
||||
findText(hdgf.getTopLevelStreams()[i], text);
|
||||
}
|
||||
System.err.println("Found " + text.size() + " text string");
|
||||
return (String[])text.toArray( new String[text.size()] );
|
||||
}
|
||||
private void findText(Stream stream, ArrayList text) {
|
||||
if(stream instanceof PointerContainingStream) {
|
||||
PointerContainingStream ps = (PointerContainingStream)stream;
|
||||
for(int i=0; i<ps.getPointedToStreams().length; i++) {
|
||||
findText(ps.getPointedToStreams()[i], text);
|
||||
}
|
||||
}
|
||||
if(stream instanceof ChunkStream) {
|
||||
ChunkStream cs = (ChunkStream)stream;
|
||||
for(int i=0; i<cs.getChunks().length; i++) {
|
||||
if(cs.getChunks()[i] != null &&
|
||||
cs.getChunks()[i].getName() != null &&
|
||||
cs.getChunks()[i].getName().equals("Text")) {
|
||||
// First command
|
||||
Command cmd = cs.getChunks()[i].getCommands()[0];
|
||||
if(cmd != null && cmd.getValue() != null) {
|
||||
text.add( cmd.getValue().toString() );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the textual contents of the file.
|
||||
*/
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
String[] allText = getAllText();
|
||||
for(int i=0; i<allText.length; i++) {
|
||||
text.append(allText[i]);
|
||||
if(!allText[i].endsWith("\r") &&
|
||||
!allText[i].endsWith("\n")) {
|
||||
text.append("\n");
|
||||
}
|
||||
}
|
||||
return text.toString();
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
if(args.length == 0) {
|
||||
System.err.println("Use:");
|
||||
System.err.println(" VisioTextExtractor <file.vsd>");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
VisioTextExtractor extractor =
|
||||
new VisioTextExtractor(new FileInputStream(args[0]));
|
||||
System.out.println(extractor.getText());
|
||||
}
|
||||
}
|
|
@ -43,6 +43,11 @@ public class ChunkStream extends Stream {
|
|||
public void findChunks() {
|
||||
ArrayList chunksA = new ArrayList();
|
||||
|
||||
if(getPointer().getOffset() == 0x64b3) {
|
||||
int i = 0;
|
||||
i++;
|
||||
}
|
||||
|
||||
int pos = 0;
|
||||
byte[] contents = getStore().getContents();
|
||||
while(pos < contents.length) {
|
||||
|
|
|
@ -83,7 +83,7 @@ public abstract class Stream {
|
|||
return new ChunkStream(pointer, store, chunkFactory);
|
||||
}
|
||||
else if(pointer.destinationHasStrings()) {
|
||||
return new StringsStream(pointer, store);
|
||||
return new StringsStream(pointer, store, chunkFactory);
|
||||
}
|
||||
|
||||
// Give up and return a generic one
|
||||
|
|
|
@ -16,13 +16,16 @@ limitations under the License.
|
|||
==================================================================== */
|
||||
package org.apache.poi.hdgf.streams;
|
||||
|
||||
import org.apache.poi.hdgf.chunks.ChunkFactory;
|
||||
import org.apache.poi.hdgf.pointers.Pointer;
|
||||
|
||||
/**
|
||||
* A Stream which holds Strings
|
||||
* A Stream which holds Strings. This is just another kind
|
||||
* of ChunkStream, it seems
|
||||
*/
|
||||
public class StringsStream extends Stream {
|
||||
protected StringsStream(Pointer pointer, StreamStore store) {
|
||||
protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) {
|
||||
super(pointer, store);
|
||||
// super(pointer, store, chunkFactory);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.poi.hdgf.streams;
|
|||
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import org.apache.poi.hdgf.chunks.Chunk;
|
||||
import org.apache.poi.hdgf.chunks.ChunkFactory;
|
||||
import org.apache.poi.hdgf.pointers.Pointer;
|
||||
import org.apache.poi.hdgf.pointers.PointerFactory;
|
||||
|
@ -202,4 +203,63 @@ public class TestStreamComplex extends StreamTest {
|
|||
assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
|
||||
assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
|
||||
}
|
||||
|
||||
public void testChunkWithText() throws Exception {
|
||||
// Parent ChunkStream is at 0x7194
|
||||
// This is one of the last children of the trailer
|
||||
Pointer trailerPtr = ptrFactory.createPointer(contents, trailerPointerAt);
|
||||
TrailerStream ts = (TrailerStream)
|
||||
Stream.createStream(trailerPtr, contents, chunkFactory, ptrFactory);
|
||||
|
||||
ts.findChildren(contents);
|
||||
|
||||
assertNotNull(ts.getChildPointers());
|
||||
assertNotNull(ts.getPointedToStreams());
|
||||
assertEquals(20, ts.getChildPointers().length);
|
||||
assertEquals(20, ts.getPointedToStreams().length);
|
||||
|
||||
assertEquals(0x7194, ts.getChildPointers()[13].getOffset());
|
||||
assertEquals(0x7194, ts.getPointedToStreams()[13].getPointer().getOffset());
|
||||
|
||||
PointerContainingStream ps7194 = (PointerContainingStream)
|
||||
ts.getPointedToStreams()[13];
|
||||
|
||||
// First child is at 0x64b3
|
||||
assertEquals(0x64b3, ps7194.getChildPointers()[0].getOffset());
|
||||
assertEquals(0x64b3, ps7194.getPointedToStreams()[0].getPointer().getOffset());
|
||||
|
||||
ChunkStream cs = (ChunkStream)ps7194.getPointedToStreams()[0];
|
||||
|
||||
// Should be 26bc bytes un-compressed
|
||||
assertEquals(0x26bc, cs.getStore().getContents().length);
|
||||
// And should have lots of children
|
||||
assertEquals(131, cs.getChunks().length);
|
||||
|
||||
// One of which is Text
|
||||
boolean hasText = false;
|
||||
for(int i=0; i<cs.getChunks().length; i++) {
|
||||
if(cs.getChunks()[i].getName().equals("Text")) {
|
||||
hasText = true;
|
||||
}
|
||||
}
|
||||
assertTrue(hasText);
|
||||
// Which is the 72nd command
|
||||
assertEquals("Text", cs.getChunks()[72].getName());
|
||||
|
||||
Chunk text = cs.getChunks()[72];
|
||||
assertEquals("Text", text.getName());
|
||||
|
||||
// Which contains our text
|
||||
assertEquals(1, text.getCommands().length);
|
||||
assertEquals("Test View\n", text.getCommands()[0].getValue());
|
||||
|
||||
|
||||
// Almost at the end is some more text
|
||||
assertEquals("Text", cs.getChunks()[128].getName());
|
||||
text = cs.getChunks()[128];
|
||||
assertEquals("Text", text.getName());
|
||||
|
||||
assertEquals(1, text.getCommands().length);
|
||||
assertEquals("Some random text, on a page\n", text.getCommands()[0].getValue());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue