Patch from bug #44937 from Squeeself - Partial support for extracting Escher images from HWPF files

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@658302 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Nick Burch 2008-05-20 16:01:53 +00:00
parent 7902d0ed3a
commit 40bb917404
11 changed files with 541 additions and 24 deletions

View File

@ -37,6 +37,7 @@
<!-- Don't forget to update status.xml too! -->
<release version="3.1-final" date="2008-06-??">
<action dev="POI-DEVELOPERS" type="add">44937 - Partial support for extracting Escher images from HWPF files</action>
<action dev="POI-DEVELOPERS" type="fix">44824 - Avoid an infinite loop when reading some HWPF pictures</action>
<action dev="POI-DEVELOPERS" type="fix">44898 - Correctly handle short last blocks in POIFS</action>
</release>

View File

@ -34,6 +34,7 @@
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.1-final" date="2008-06-??">
<action dev="POI-DEVELOPERS" type="add">44937 - Partial support for extracting Escher images from HWPF files</action>
<action dev="POI-DEVELOPERS" type="fix">44824 - Avoid an infinite loop when reading some HWPF pictures</action>
<action dev="POI-DEVELOPERS" type="fix">44898 - Correctly handle short last blocks in POIFS</action>
</release>

View File

@ -65,6 +65,12 @@ public class EscherClientAnchorRecord
int size = 0;
// Always find 4 two byte entries. Sometimes find 9
if (bytesRemaining == 4) // Word format only 4 bytes
{
// Not sure exactly what the format is quite yet, likely a reference to a PLC
}
else
{
field_1_flag = LittleEndian.getShort( data, pos + size ); size += 2;
field_2_col1 = LittleEndian.getShort( data, pos + size ); size += 2;
field_3_dx1 = LittleEndian.getShort( data, pos + size ); size += 2;
@ -79,6 +85,7 @@ public class EscherClientAnchorRecord
} else {
shortRecord = true;
}
}
bytesRemaining -= size;
remainingData = new byte[bytesRemaining];
System.arraycopy( data, pos + size, remainingData, 0, bytesRemaining );

View File

@ -53,10 +53,10 @@ public class HWPFDocument extends POIDocument
protected FileInformationBlock _fib;
/** main document stream buffer*/
private byte[] _mainStream;
protected byte[] _mainStream;
/** table stream buffer*/
private byte[] _tableStream;
protected byte[] _tableStream;
/** data stream buffer*/
protected byte[] _dataStream;
@ -94,6 +94,12 @@ public class HWPFDocument extends POIDocument
/** Holds pictures table */
protected PicturesTable _pictures;
/** Holds FSPA (shape) information */
protected FSPATable _fspa;
/** Escher Drawing Group information */
protected EscherRecordHolder _dgg;
protected HWPFDocument()
{
super(null, null);
@ -205,9 +211,6 @@ public class HWPFDocument extends POIDocument
_dataStream = new byte[0];
}
// read in the pictures stream
_pictures = new PicturesTable(this, _dataStream);
// get the start of text in the main stream
int fcMin = _fib.getFcMin();
@ -227,6 +230,20 @@ public class HWPFDocument extends POIDocument
_pbt.adjustForDelete(0, 0, cpMin);
}
// Read FSPA and Escher information
_fspa = new FSPATable(_tableStream, _fib.getFcPlcspaMom(), _fib.getLcbPlcspaMom(), getTextTable().getTextPieces());
if (_fib.getFcDggInfo() != 0)
{
_dgg = new EscherRecordHolder(_tableStream, _fib.getFcDggInfo(), _fib.getLcbDggInfo());
} else
{
_dgg = new EscherRecordHolder();
}
// read in the pictures stream
_pictures = new PicturesTable(this, _dataStream, _mainStream, _fspa, _dgg);
_st = new SectionTable(_mainStream, _tableStream, _fib.getFcPlcfsed(), _fib.getLcbPlcfsed(), fcMin, getTextTable().getTextPieces());
_ss = new StyleSheet(_tableStream, _fib.getFcStshf());
_ft = new FontTable(_tableStream, _fib.getFcSttbfffn(), _fib.getLcbSttbfffn());

View File

@ -0,0 +1,116 @@
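/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */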
package org.apache.poi.hwpf.model;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.ddf.DefaultEscherRecordFactory;
import org.apache.poi.ddf.EscherContainerRecord;
import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.ddf.EscherRecordFactory;
/**
 * Holds the top-level Escher records decoded from a range of bytes.
 * Based on AbstractEscherRecordHolder from HSSF.
 *
 * @author Squeeself
 */
public class EscherRecordHolder
{
    /** Top-level records, in the order in which they were decoded. */
    protected ArrayList escherRecords = new ArrayList();

    /**
     * Creates a holder that contains no records.
     */
    public EscherRecordHolder()
    {
    }

    /**
     * Creates a holder and decodes the records found in the given range.
     *
     * @param data   buffer holding the serialized records
     * @param offset index in <code>data</code> of the first record
     * @param size   number of bytes, counted from <code>offset</code>, to decode
     */
    public EscherRecordHolder(byte[] data, int offset, int size)
    {
        fillEscherRecords(data, offset, size);
    }

    /**
     * Decodes records one after another until the end of the range is
     * reached, appending each of them to {@link #escherRecords}.
     */
    private void fillEscherRecords(byte[] data, int offset, int size)
    {
        final EscherRecordFactory factory = new DefaultEscherRecordFactory();
        int cursor = offset;
        while (cursor < offset + size)
        {
            final EscherRecord record = factory.createRecord(data, cursor);
            escherRecords.add(record);
            final int consumed = record.fillFields(data, cursor, factory);
            // There is an empty byte between each top-level record in a
            // Word doc, hence the extra step of one.
            cursor = cursor + consumed + 1;
        }
    }

    /**
     * @return the live list of top-level records held by this object
     */
    public List getEscherRecords()
    {
        return escherRecords;
    }

    /**
     * Concatenates the string form of every held record, or reports that
     * nothing was decoded when the holder is empty.
     */
    public String toString()
    {
        final String lineBreak = System.getProperty("line.separator");
        final StringBuffer text = new StringBuffer();

        if (escherRecords.isEmpty())
        {
            text.append("No Escher Records Decoded" + lineBreak);
        }

        Iterator all = escherRecords.iterator();
        while (all.hasNext())
        {
            EscherRecord record = (EscherRecord) all.next();
            text.append(record.toString());
        }
        return text.toString();
    }

    /**
     * If we have a EscherContainerRecord as one of our
     *  children (and most top level escher holders do),
     *  then return that.
     *
     * @return the first top-level container record, or null if there is none
     */
    public EscherContainerRecord getEscherContainer() {
        final int count = escherRecords.size();
        for (int i = 0; i < count; i++) {
            final Object candidate = escherRecords.get(i);
            if (candidate instanceof EscherContainerRecord) {
                return (EscherContainerRecord) candidate;
            }
        }
        return null;
    }

    /**
     * Descends into all our children, returning the
     *  first EscherRecord with the given id, or null
     *  if none found
     *
     * @param id the record id to look for
     */
    public EscherRecord findFirstWithId(short id) {
        return findFirstWithId(id, getEscherRecords());
    }

    /**
     * Searches one level of records and then, only if nothing matched at
     * that level, each container's children in turn.
     */
    private EscherRecord findFirstWithId(short id, List records) {
        // First pass: look for a match among the records of this level
        Iterator sameLevel = records.iterator();
        while (sameLevel.hasNext()) {
            EscherRecord candidate = (EscherRecord) sameLevel.next();
            if (candidate.getRecordId() == id) {
                return candidate;
            }
        }

        // Second pass: recurse into every container found at this level
        Iterator parents = records.iterator();
        while (parents.hasNext()) {
            EscherRecord parent = (EscherRecord) parents.next();
            if (!parent.isContainerRecord()) {
                continue;
            }
            EscherRecord hit = findFirstWithId(id, parent.getChildRecords());
            if (hit != null) {
                return hit;
            }
        }

        // Nothing with that id below this level
        return null;
    }
}

View File

@ -0,0 +1,182 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import org.apache.poi.util.BitField;
import org.apache.poi.util.BitFieldFactory;
import org.apache.poi.util.LittleEndian;
/**
 * File Shape Address structure.
 *
 * <p>One entry is {@link #FSPA_SIZE} bytes long and is read and written
 * little-endian in this order: spid, xaLeft, yaTop, xaRight, yaBottom
 * (ints), the packed option bits (short) and cTxbx (int).</p>
 *
 * @author Squeeself
 */
public class FSPA
{
    /** Number of bytes produced by {@link #toByteArray()}: five ints, one short and one more int. */
    public static final int FSPA_SIZE = 26;

    private int spid; // Shape identifier. Used to get data position
    private int xaLeft; // Enclosing rectangle
    private int yaTop; // Enclosing rectangle
    private int xaRight; // Enclosing rectangle
    private int yaBottom; // Enclosing rectangle
    private short options; // Packed flags, decoded through the bit fields below

    // The masks are constants, so they are declared final.
    private static final BitField fHdr = BitFieldFactory.getInstance(0x0001); // 1 in undo when in header
    private static final BitField bx = BitFieldFactory.getInstance(0x0006); // x pos relative to anchor CP: 0 - page margin, 1 - top of page, 2 - text, 3 - reserved
    private static final BitField by = BitFieldFactory.getInstance(0x0018); // y pos relative to anchor CP: ditto
    private static final BitField wr = BitFieldFactory.getInstance(0x01E0); // Text wrapping mode: 0 - like 2 w/o absolute, 1 - no text next to shape, 2 - wrap around absolute object, 3 - wrap as if no object, 4 - wrap tightly around object, 5 - wrap tightly, allow holes, 6-15 - reserved
    private static final BitField wrk = BitFieldFactory.getInstance(0x1E00); // Text wrapping mode type (for modes 2&4): 0 - wrap both sides, 1 - wrap only left, 2 - wrap only right, 3 - wrap largest side
    private static final BitField fRcaSimple = BitFieldFactory.getInstance(0x2000); // Overwrites bx if set, forcing rectangle to be page relative
    private static final BitField fBelowText = BitFieldFactory.getInstance(0x4000); // if true, shape is below text, otherwise above
    private static final BitField fAnchorLock = BitFieldFactory.getInstance(0x8000); // if true, anchor is locked

    private int cTxbx; // Count of textboxes in shape (undo doc only)

    /**
     * Creates an entry with every field left at zero.
     */
    public FSPA()
    {
    }

    /**
     * Reads one entry from a byte array.
     *
     * @param bytes  buffer holding the serialized entry
     * @param offset index in <code>bytes</code> of the first byte of the entry;
     *               {@link #FSPA_SIZE} bytes are read from there
     */
    public FSPA(byte[] bytes, int offset)
    {
        spid = LittleEndian.getInt(bytes, offset);
        offset += LittleEndian.INT_SIZE;
        xaLeft = LittleEndian.getInt(bytes, offset);
        offset += LittleEndian.INT_SIZE;
        yaTop = LittleEndian.getInt(bytes, offset);
        offset += LittleEndian.INT_SIZE;
        xaRight = LittleEndian.getInt(bytes, offset);
        offset += LittleEndian.INT_SIZE;
        yaBottom = LittleEndian.getInt(bytes, offset);
        offset += LittleEndian.INT_SIZE;
        options = LittleEndian.getShort(bytes, offset);
        offset += LittleEndian.SHORT_SIZE;
        cTxbx = LittleEndian.getInt(bytes, offset);
    }

    /** @return the shape identifier */
    public int getSpid()
    {
        return spid;
    }

    /** @return the left edge of the enclosing rectangle */
    public int getXaLeft()
    {
        return xaLeft;
    }

    /** @return the top edge of the enclosing rectangle */
    public int getYaTop()
    {
        return yaTop;
    }

    /** @return the right edge of the enclosing rectangle */
    public int getXaRight()
    {
        return xaRight;
    }

    /** @return the bottom edge of the enclosing rectangle */
    public int getYaBottom()
    {
        return yaBottom;
    }

    /** @return true if the fHdr bit (mask 0x0001) is set */
    public boolean isFHdr()
    {
        return fHdr.isSet(options);
    }

    /** @return the bx value (mask 0x0006): x position relative to the anchor CP */
    public short getBx()
    {
        return bx.getShortValue(options);
    }

    /** @return the by value (mask 0x0018): y position relative to the anchor CP */
    public short getBy()
    {
        return by.getShortValue(options);
    }

    /** @return the wr value (mask 0x01E0): text wrapping mode */
    public short getWr()
    {
        return wr.getShortValue(options);
    }

    /** @return the wrk value (mask 0x1E00): text wrapping mode type */
    public short getWrk()
    {
        return wrk.getShortValue(options);
    }

    /** @return true if the fRcaSimple bit (mask 0x2000) is set */
    public boolean isFRcaSimple()
    {
        return fRcaSimple.isSet(options);
    }

    /** @return true if the fBelowText bit (mask 0x4000) is set */
    public boolean isFBelowText()
    {
        return fBelowText.isSet(options);
    }

    /** @return true if the fAnchorLock bit (mask 0x8000) is set */
    public boolean isFAnchorLock()
    {
        return fAnchorLock.isSet(options);
    }

    /** @return the count of textboxes in the shape */
    public int getCTxbx()
    {
        return cTxbx;
    }

    /**
     * Serializes this entry using the same layout the byte-array
     * constructor reads.
     *
     * @return a new array of {@link #FSPA_SIZE} bytes
     */
    public byte[] toByteArray()
    {
        int offset = 0;
        byte[] buf = new byte[FSPA_SIZE];

        LittleEndian.putInt(buf, offset, spid);
        offset += LittleEndian.INT_SIZE;
        LittleEndian.putInt(buf, offset, xaLeft);
        offset += LittleEndian.INT_SIZE;
        LittleEndian.putInt(buf, offset, yaTop);
        offset += LittleEndian.INT_SIZE;
        LittleEndian.putInt(buf, offset, xaRight);
        offset += LittleEndian.INT_SIZE;
        LittleEndian.putInt(buf, offset, yaBottom);
        offset += LittleEndian.INT_SIZE;
        LittleEndian.putShort(buf, offset, options);
        offset += LittleEndian.SHORT_SIZE;
        // Last field: nothing follows it, so the offset is not advanced again.
        LittleEndian.putInt(buf, offset, cTxbx);

        return buf;
    }

    /**
     * @return every field, including the decoded option bits, on one line
     */
    public String toString()
    {
        StringBuffer buf = new StringBuffer();
        buf.append("spid: ").append(spid);
        buf.append(", xaLeft: ").append(xaLeft);
        buf.append(", yaTop: ").append(yaTop);
        buf.append(", xaRight: ").append(xaRight);
        buf.append(", yaBottom: ").append(yaBottom);
        buf.append(", options: ").append(options);
        buf.append(" (fHdr: ").append(isFHdr());
        buf.append(", bx: ").append(getBx());
        buf.append(", by: ").append(getBy());
        buf.append(", wr: ").append(getWr());
        buf.append(", wrk: ").append(getWrk());
        buf.append(", fRcaSimple: ").append(isFRcaSimple());
        buf.append(", fBelowText: ").append(isFBelowText());
        buf.append(", fAnchorLock: ").append(isFAnchorLock());
        buf.append("), cTxbx: ").append(cTxbx);
        return buf.toString();
    }
}

View File

@ -0,0 +1,82 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
/**
 * This class holds all the FSPA (File Shape Address) structures.
 *
 * <p>The shapes are kept in the order in which they are read from the
 * PLC; a second index maps the start position of each PLC entry to the
 * position of its shape in that list.</p>
 *
 * @author Squeeself
 */
public class FSPATable
{
    /** FSPA entries, in the order they were read. */
    protected ArrayList shapes = new ArrayList();
    /** Maps the start of each PLC entry (Integer) to its index in {@link #shapes} (Integer). */
    protected HashMap cps = new HashMap();
    /** The list handed to the constructor; this class only stores it. */
    protected List _text;

    /**
     * Reads every FSPA entry of the PLC located in the table stream.
     *
     * @param tableStream the table stream buffer
     * @param fcPlcspa    offset of the PLC in the table stream; 0 means the
     *                    document has no drawing objects and nothing is read
     * @param lcbPlcspa   size of the PLC in bytes
     * @param tpt         list that is stored as-is in {@link #_text}
     */
    public FSPATable(byte[] tableStream, int fcPlcspa, int lcbPlcspa, List tpt)
    {
        _text = tpt;

        // Will be 0 if no drawing objects in document
        if (fcPlcspa == 0)
        {
            return;
        }

        PlexOfCps plex = new PlexOfCps(tableStream, fcPlcspa, lcbPlcspa, FSPA.FSPA_SIZE);
        for (int i = 0; i < plex.length(); i++)
        {
            GenericPropertyNode property = plex.getProperty(i);
            FSPA fspa = new FSPA(property.getBytes(), 0);

            shapes.add(fspa);
            cps.put(Integer.valueOf(property.getStart()), Integer.valueOf(i));
        }
    }

    /**
     * Looks up the shape whose PLC entry starts at the given position.
     *
     * @param cp the start position to look up
     * @return the matching FSPA, or null if no entry starts at <code>cp</code>
     */
    public FSPA getFspaFromCp(int cp)
    {
        Integer idx = (Integer) cps.get(Integer.valueOf(cp));
        if (idx == null)
        {
            return null;
        }
        return (FSPA) shapes.get(idx.intValue());
    }

    /**
     * @return the live list of FSPA entries, in the order they were read
     */
    public List getShapes()
    {
        return shapes;
    }

    /**
     * Dumps every entry between an opening and a closing tag. Entries are
     * listed in the iteration order of the underlying HashMap.
     */
    public String toString()
    {
        StringBuffer buf = new StringBuffer();
        // The opening tag used to read "FPSA", which did not match the closing tag.
        buf.append("[FSPA PLC size=").append(shapes.size()).append("]\n");
        for (Iterator it = cps.keySet().iterator(); it.hasNext(); )
        {
            Integer i = (Integer) it.next();
            FSPA fspa = (FSPA) shapes.get(((Integer) cps.get(i)).intValue());
            // NOTE(review): the label says FC, but the key is the value that
            // getFspaFromCp() treats as a CP - confirm which one is intended.
            buf.append("  [FC: ").append(i.toString()).append("] ");
            buf.append(fspa.toString());
            buf.append("\n");
        }
        buf.append("[/FSPA PLC]");
        return buf.toString();
    }
}

View File

@ -309,6 +309,26 @@ public class FileInformationBlock extends FIBAbstractType
return _fieldHandler.getFieldSize(FIBFieldHandler.PLCFFLDMOM);
}
/**
 * Returns the offset that the FIB field handler reports for the
 * PLCSPAMOM field.
 *
 * @return the field offset of {@link FIBFieldHandler#PLCSPAMOM}
 */
public int getFcPlcspaMom()
{
    final int fieldOffset = _fieldHandler.getFieldOffset(FIBFieldHandler.PLCSPAMOM);
    return fieldOffset;
}
/**
 * Returns the size that the FIB field handler reports for the
 * PLCSPAMOM field.
 *
 * @return the field size of {@link FIBFieldHandler#PLCSPAMOM}
 */
public int getLcbPlcspaMom()
{
    final int fieldSize = _fieldHandler.getFieldSize(FIBFieldHandler.PLCSPAMOM);
    return fieldSize;
}
/**
 * Returns the offset that the FIB field handler reports for the
 * DGGINFO field.
 *
 * @return the field offset of {@link FIBFieldHandler#DGGINFO}
 */
public int getFcDggInfo()
{
    final int fieldOffset = _fieldHandler.getFieldOffset(FIBFieldHandler.DGGINFO);
    return fieldOffset;
}
/**
 * Returns the size that the FIB field handler reports for the
 * DGGINFO field.
 *
 * @return the field size of {@link FIBFieldHandler#DGGINFO}
 */
public int getLcbDggInfo()
{
    final int fieldSize = _fieldHandler.getFieldSize(FIBFieldHandler.DGGINFO);
    return fieldSize;
}
public void writeTo (byte[] mainStream, HWPFOutputStream tableStream)
throws IOException
{

View File

@ -26,7 +26,12 @@ import org.apache.poi.hwpf.usermodel.Range;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.poi.ddf.DefaultEscherRecordFactory;
import org.apache.poi.ddf.EscherBSERecord;
import org.apache.poi.ddf.EscherBlipRecord;
import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.ddf.EscherRecordFactory;
/**
* Holds information about all pictures embedded in Word Document either via "Insert -> Picture -> From File" or via
@ -57,6 +62,9 @@ public class PicturesTable
private HWPFDocument _document;
private byte[] _dataStream;
private byte[] _mainStream;
private FSPATable _fspa;
private EscherRecordHolder _dgg;
/** @link dependency
* @stereotype instantiate*/
@ -67,10 +75,13 @@ public class PicturesTable
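* @param _document the document this table is created for
* @param _dataStream the data stream buffer
* @param _mainStream the main document stream buffer
* @param fspa the FSPA (shape) table
* @param dgg the Escher drawing group records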
*/
public PicturesTable(HWPFDocument _document, byte[] _dataStream)
public PicturesTable(HWPFDocument _document, byte[] _dataStream, byte[] _mainStream, FSPATable fspa, EscherRecordHolder dgg)
{
    // Plain field capture; the assignments are independent of one another.
    // Escher-related inputs first ...
    this._dgg = dgg;
    this._fspa = fspa;
    // ... then the raw streams and the owning document.
    this._mainStream = _mainStream;
    this._dataStream = _dataStream;
    this._document = _document;
}
/**
@ -84,6 +95,13 @@ public class PicturesTable
return false;
}
/**
 * Tells whether the given run is a special-character run that is not
 * flagged as obj, OLE2 or data and whose text starts with the character
 * U+0008.
 *
 * @param run the character run to inspect
 * @return true if all of the conditions above hold
 */
public boolean hasEscherPicture(CharacterRun run) {
    if (!run.isSpecialCharacter()) {
        return false;
    }
    if (run.isObj() || run.isOle2() || run.isData()) {
        return false;
    }
    return run.text().startsWith("\u0008");
}
/**
* determines whether specified CharacterRun contains reference to a picture
* @param run
@ -123,6 +141,46 @@ public class PicturesTable
return null;
}
/**
 * Performs a recursive search for pictures in the given list of escher records.
 *
 * <p>For every BSE record the embedded blip is used when present.
 * Otherwise, if the BSE carries a positive offset that lies inside the
 * main stream, the record at that offset is decoded and used, but only
 * when it really is a blip. Anything else is skipped rather than allowed
 * to abort the whole search. The children of every record are searched
 * in turn.</p>
 *
 * @param escherRecords the escher records.
 * @param pictures the list to populate with the pictures.
 */
private void searchForPictures(List escherRecords, List pictures)
{
    for (Iterator recordIter = escherRecords.iterator(); recordIter.hasNext(); )
    {
        Object obj = recordIter.next();
        if (!(obj instanceof EscherRecord))
        {
            continue;
        }
        EscherRecord escherRecord = (EscherRecord) obj;

        if (escherRecord instanceof EscherBSERecord)
        {
            EscherBSERecord bse = (EscherBSERecord) escherRecord;
            EscherBlipRecord blip = bse.getBlipRecord();
            if (blip != null)
            {
                pictures.add(new Picture(blip.getPicturedata()));
            }
            else if (bse.getOffset() > 0
                    && _mainStream != null
                    && bse.getOffset() < _mainStream.length)
            {
                // Blip stored in delay stream, which in a word doc, is the main stream
                EscherRecordFactory recordFactory = new DefaultEscherRecordFactory();
                EscherRecord delayed = recordFactory.createRecord(_mainStream, bse.getOffset());

                // The offset comes from the document, so do not assume that
                // the record found there is a blip: an unchecked cast would
                // fail with a ClassCastException on unexpected data.
                if (delayed instanceof EscherBlipRecord)
                {
                    delayed.fillFields(_mainStream, bse.getOffset(), recordFactory);
                    blip = (EscherBlipRecord) delayed;
                    pictures.add(new Picture(blip.getPicturedata()));
                }
            }
        }

        // Recursive call.
        searchForPictures(escherRecord.getChildRecords(), pictures);
    }
}
/**
* Not all documents have all the images concatenated in the data stream
* although MS claims so. The best approach is to scan all character runs.
@ -136,13 +194,14 @@ public class PicturesTable
for (int i = 0; i < range.numCharacterRuns(); i++) {
CharacterRun run = range.getCharacterRun(i);
String text = run.text();
int j = text.charAt(0);
Picture picture = extractPicture(run, false);
if (picture != null) {
pictures.add(picture);
}
}
searchForPictures(_dgg.getEscherRecords(), pictures);
return pictures;
}

View File

@ -99,6 +99,15 @@ public class Picture
}
}
public Picture(byte[] _dataStream)
{
this._dataStream = _dataStream;
this.dataBlockStartOfsset = 0;
this.dataBlockSize = _dataStream.length;
this.pictureBytesStartOffset = 0;
this.size = _dataStream.length;
}
private void fillWidthHeight()
{
String ext = suggestFileExtension();

View File

@ -35,10 +35,12 @@ public class TestHWPFPictures extends TestCase {
private String docAFile;
private String docBFile;
private String docCFile;
private String docDFile;
private String imgAFile;
private String imgBFile;
private String imgCFile;
private String imgDFile;
protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path");
@ -46,10 +48,12 @@ public class TestHWPFPictures extends TestCase {
docAFile = dirname + "/testPictures.doc";
docBFile = dirname + "/two_images.doc";
docCFile = dirname + "/vector_image.doc";
docDFile = dirname + "/GaiaTest.doc";
imgAFile = dirname + "/simple_image.jpg";
imgBFile = dirname + "/simple_image.png";
imgCFile = dirname + "/vector_image.emf";
imgDFile = dirname + "/GaiaTestImg.png";
}
/**
@ -127,6 +131,25 @@ public class TestHWPFPictures extends TestCase {
assertBytesSame(picBytes, pic.getContent());
}
/**
 * Checks that the single Escher picture of the test document is found
 * and that its content equals the reference image byte for byte.
 *
 * Disabled (no "test" prefix) pending the missing files being
 * uploaded to bug #44937.
 */
public void BROKENtestEscherDrawing() throws Exception
{
    HWPFDocument document = new HWPFDocument(new FileInputStream(docDFile));
    List found = document.getPicturesTable().getAllPictures();

    // Exactly one picture is expected in this document
    assertEquals(1, found.size());
    Picture extracted = (Picture) found.get(0);
    assertNotNull(extracted);

    // Compare against the reference image: length first, then content
    byte[] expected = readFile(imgDFile);
    assertEquals(expected.length, extracted.getContent().length);
    assertBytesSame(expected, extracted.getContent());
}
private void assertBytesSame(byte[] a, byte[] b) {
assertEquals(a.length, b.length);