mirror of https://github.com/apache/poi.git
More work understanding hpbf
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@686624 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
82ae0b3825
commit
caa6292337
|
@ -0,0 +1,75 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
====================================================================
|
||||||
|
-->
|
||||||
|
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
|
||||||
|
|
||||||
|
<document>
|
||||||
|
<header>
|
||||||
|
<title>POI-HPBF - A Guide to the Publisher File Format</title>
|
||||||
|
<subtitle>Overview</subtitle>
|
||||||
|
<authors>
|
||||||
|
<person name="Nick Burch" email="nick at torchbox dot com"/>
|
||||||
|
</authors>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<section><title>Document Streams</title>
|
||||||
|
<p>
|
||||||
|
The file is made up of a number of POIFS streams. A typical
|
||||||
|
file will be made up as follows:
|
||||||
|
</p>
|
||||||
|
<source>
|
||||||
|
Root Entry -
|
||||||
|
Objects -
|
||||||
|
(no children)
|
||||||
|
SummaryInformation <(0x05)SummaryInformation>
|
||||||
|
DocumentSummaryInformation <(0x05)DocumentSummaryInformation>
|
||||||
|
Escher -
|
||||||
|
EscherStm
|
||||||
|
EscherDelayStm
|
||||||
|
Quill -
|
||||||
|
QuillSub -
|
||||||
|
CONTENTS
|
||||||
|
CompObj <(0x01)CompObj>
|
||||||
|
Envelope
|
||||||
|
Contents
|
||||||
|
Internal <(0x03)Internal>
|
||||||
|
CompObj <(0x01)CompObj>
|
||||||
|
VBA -
|
||||||
|
(no children)
|
||||||
|
</source>
|
||||||
|
</section>
|
||||||
|
<section><title>Changing Text</title>
|
||||||
|
<p>If you make a change to the text of a file, but do not change
|
||||||
|
how much text there is, then the <em>CONTENTS</em> stream
|
||||||
|
will undergo a small change, and the <em>Contents</em> stream
|
||||||
|
will undergo a large change.</p>
|
||||||
|
<p>If you make a change to the text of a file, and change the
|
||||||
|
amount of text there is, then both the <em>Contents</em> and
|
||||||
|
the <em>CONTENTS</em> streams change.</p>
|
||||||
|
</section>
|
||||||
|
<section><title>Changing Shapes</title>
|
||||||
|
<p>If you alter the size of a textbox, but make no text changes,
|
||||||
|
then both <em>Contents</em> and <em>CONTENTS</em> streams
|
||||||
|
change. There are no changes to the Escher streams.</p>
|
||||||
|
<p>If you set the background colour of a textbox, but make
|
||||||
|
no changes to the text, then ... (TODO: the resulting stream changes have not been written up yet).</p>
|
||||||
|
</section>
|
||||||
|
</body>
|
||||||
|
</document>
|
|
@ -0,0 +1,53 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
====================================================================
|
||||||
|
-->
|
||||||
|
<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V1.1//EN" "../dtd/document-v11.dtd">
|
||||||
|
|
||||||
|
<document>
|
||||||
|
<header>
|
||||||
|
<title>POI-HPBF - Java API To Access Microsoft Publisher Format Files</title>
|
||||||
|
<subtitle>Overview</subtitle>
|
||||||
|
<authors>
|
||||||
|
<person name="Nick Burch" email="nick at apache dot org"/>
|
||||||
|
</authors>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<section>
|
||||||
|
<title>Overview</title>
|
||||||
|
|
||||||
|
<p>HPBF is the POI Project's pure Java implementation of the Publisher file format.</p>
|
||||||
|
<p>Currently, HPBF is in the experimental stage, while we try
|
||||||
|
to figure out the file format. Our initial aim is to provide
|
||||||
|
a text extractor for the format, with low level code following
|
||||||
|
after that if demand and developer interest warrant it.</p>
|
||||||
|
<p>At this time, there is no <em>usermodel</em> api or similar.</p>
|
||||||
|
<p>Our current understanding of the file format is documented
|
||||||
|
<link href="file-format.html">here</link>.</p>
|
||||||
|
<note>
|
||||||
|
This code currently lives in the
|
||||||
|
<link href="http://svn.apache.org/viewcvs.cgi/poi/trunk/src/scratchpad/">scratchpad area</link>
|
||||||
|
of the POI SVN repository.
|
||||||
|
Ensure that you have the scratchpad jar or the scratchpad
|
||||||
|
build area in your
|
||||||
|
classpath before experimenting with this code.
|
||||||
|
</note>
|
||||||
|
</section>
|
||||||
|
</body>
|
||||||
|
</document>
|
|
@ -25,6 +25,8 @@ import org.apache.poi.ddf.EscherRecord;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
import org.apache.poi.util.StringUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* For dumping out the contents of HPBF (Publisher)
|
* For dumping out the contents of HPBF (Publisher)
|
||||||
|
@ -52,6 +54,26 @@ public class HPBFDumper {
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dumps out the given number of bytes as hex,
|
||||||
|
* two chars
|
||||||
|
*/
|
||||||
|
private String dumpBytes(byte[] data, int offset, int len) {
|
||||||
|
StringBuffer ret = new StringBuffer();
|
||||||
|
for(int i=0; i<len; i++) {
|
||||||
|
int j = i + offset;
|
||||||
|
int b = data[j];
|
||||||
|
if(b < 0) { b += 256; }
|
||||||
|
|
||||||
|
String bs = Integer.toHexString(b);
|
||||||
|
if(bs.length() == 1)
|
||||||
|
ret.append('0');
|
||||||
|
ret.append(bs);
|
||||||
|
ret.append(' ');
|
||||||
|
}
|
||||||
|
return ret.toString();
|
||||||
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
if(args.length < 1) {
|
if(args.length < 1) {
|
||||||
System.err.println("Use:");
|
System.err.println("Use:");
|
||||||
|
@ -159,9 +181,61 @@ public class HPBFDumper {
|
||||||
System.out.println("");
|
System.out.println("");
|
||||||
System.out.println("CONTENTS - " + data.length + " bytes long:");
|
System.out.println("CONTENTS - " + data.length + " bytes long:");
|
||||||
|
|
||||||
// Dump out up to 0x200
|
// Between the start and 0x200 we have
|
||||||
|
// CHNKINK(space) + 24 bytes + 0x1800
|
||||||
|
// TEXT + 6 bytes
|
||||||
|
// TEXT + 8 bytes + 0x1800
|
||||||
|
// STSH + 6 bytes
|
||||||
|
// STSH + 8 bytes + 0x1800
|
||||||
|
// STSH + 6 bytes
|
||||||
|
// STSH + 8 bytes + 0x1800
|
||||||
|
// but towards 0x200 the pattern may
|
||||||
|
// break down a little bit
|
||||||
|
|
||||||
// Text from 0x200 onwards for a bit
|
// After the second of a given type,
|
||||||
|
// it seems to be 4 bytes giving the start,
|
||||||
|
// then 4 bytes giving the length, then
|
||||||
|
// 18 00
|
||||||
|
System.out.println(
|
||||||
|
new String(data, 0, 8) +
|
||||||
|
dumpBytes(data, 8, 0x22-8)
|
||||||
|
);
|
||||||
|
|
||||||
|
int pos = 0x22;
|
||||||
|
boolean sixNotTen = true;
|
||||||
|
while(pos < 0x200) {
|
||||||
|
String text = new String(data, pos, 4);
|
||||||
|
int blen = 10;
|
||||||
|
if(sixNotTen)
|
||||||
|
blen = 6;
|
||||||
|
System.out.println(
|
||||||
|
text + " " + dumpBytes(data, pos+4, blen)
|
||||||
|
);
|
||||||
|
|
||||||
|
pos += 4 + blen;
|
||||||
|
sixNotTen = ! sixNotTen;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Text from 0x200 onwards until we get
|
||||||
|
// to \r(00)\n(00)(00)(00)
|
||||||
|
int textStop = -1;
|
||||||
|
for(int i=0x200; i<data.length-2 && textStop == -1; i++) {
|
||||||
|
if(data[i] == 0 && data[i+1] == 0 && data[i+2] == 0) {
|
||||||
|
textStop = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(textStop > 0) {
|
||||||
|
int len = (textStop - 0x200) / 2;
|
||||||
|
System.out.println("");
|
||||||
|
System.out.println(
|
||||||
|
StringUtil.getFromUnicodeLE(data, 0x200, len)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// The font list comes slightly later
|
||||||
|
|
||||||
|
// The hyperlinks may come before the fonts,
|
||||||
|
// or slightly in front
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void dump001CompObj(DirectoryNode dir) {
|
protected void dump001CompObj(DirectoryNode dir) {
|
||||||
|
|
Loading…
Reference in New Issue