From 21885a6fd5da01551cf8dd22a7e9af919250bed9 Mon Sep 17 00:00:00 2001 From: Yegor Kozlov Date: Mon, 20 Jun 2011 15:56:28 +0000 Subject: [PATCH] bug 51351: more progress with WordToFoExtractor: support for hyperlinks, common fields and code cleanup git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1137673 13f79535-47bb-0310-9956-ffa450edef68 --- .../hwpf/extractor/AbstractToFoExtractor.java | 206 ++++ .../poi/hwpf/extractor/WordToFoExtractor.java | 1059 +++++++++-------- .../hwpf/extractor/TestWordToFoExtractor.java | 95 ++ test-data/document/equation.doc | Bin 0 -> 13824 bytes test-data/document/hyperlink.doc | Bin 0 -> 9728 bytes test-data/document/pageref.doc | Bin 0 -> 9728 bytes 6 files changed, 867 insertions(+), 493 deletions(-) create mode 100644 src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java create mode 100644 src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractor.java create mode 100644 test-data/document/equation.doc create mode 100644 test-data/document/hyperlink.doc create mode 100644 test-data/document/pageref.doc diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java new file mode 100644 index 0000000000..4ac0eead72 --- /dev/null +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/AbstractToFoExtractor.java @@ -0,0 +1,206 @@ +/* + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + */ +package org.apache.poi.hwpf.extractor; + +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Text; + +public abstract class AbstractToFoExtractor +{ + + private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format"; + + protected final Document document; + protected final Element layoutMasterSet; + protected final Element root; + + public AbstractToFoExtractor( Document document ) + { + this.document = document; + + root = document.createElementNS( NS_XSLFO, "fo:root" ); + document.appendChild( root ); + + layoutMasterSet = document.createElementNS( NS_XSLFO, + "fo:layout-master-set" ); + root.appendChild( layoutMasterSet ); + } + + protected Element addFlowToPageSequence( final Element pageSequence, + String flowName ) + { + final Element flow = document.createElementNS( NS_XSLFO, "fo:flow" ); + flow.setAttribute( "flow-name", flowName ); + pageSequence.appendChild( flow ); + + return flow; + } + + protected Element addListItem( Element listBlock ) + { + Element result = createListItem(); + listBlock.appendChild( result ); + return result; + } + + protected Element addListItemBody( Element listItem ) + { + Element result = createListItemBody(); + listItem.appendChild( result ); + return result; + } + + protected Element addListItemLabel( Element listItem, String text ) + { + Element result = createListItemLabel( text ); + listItem.appendChild( result ); + return result; + } + + protected Element addPageSequence( String pageMaster ) + 
{ + final Element pageSequence = document.createElementNS( NS_XSLFO, + "fo:page-sequence" ); + pageSequence.setAttribute( "master-reference", pageMaster ); + root.appendChild( pageSequence ); + return pageSequence; + } + + protected Element addRegionBody( Element pageMaster ) + { + final Element regionBody = document.createElementNS( NS_XSLFO, + "fo:region-body" ); + pageMaster.appendChild( regionBody ); + + return regionBody; + } + + protected Element addSimplePageMaster( String masterName ) + { + final Element simplePageMaster = document.createElementNS( NS_XSLFO, + "fo:simple-page-master" ); + simplePageMaster.setAttribute( "master-name", masterName ); + layoutMasterSet.appendChild( simplePageMaster ); + + return simplePageMaster; + } + + protected Element addTable( Element flow ) + { + final Element table = document.createElementNS( NS_XSLFO, "fo:table" ); + flow.appendChild( table ); + return table; + } + + protected Element createBasicLinkExternal( String externalDestination ) + { + final Element basicLink = document.createElementNS( NS_XSLFO, + "fo:basic-link" ); + basicLink.setAttribute( "external-destination", externalDestination ); + return basicLink; + } + + protected Element createBasicLinkInternal( String internalDestination ) + { + final Element basicLink = document.createElementNS( NS_XSLFO, + "fo:basic-link" ); + basicLink.setAttribute( "internal-destination", internalDestination ); + return basicLink; + } + + protected Element createBlock() + { + return document.createElementNS( NS_XSLFO, "fo:block" ); + } + + protected Element createExternalGraphic( String source ) + { + Element result = document.createElementNS( NS_XSLFO, + "fo:external-graphic" ); + result.setAttribute( "src", "url('" + source + "')" ); + return result; + } + + protected Element createInline() + { + return document.createElementNS( NS_XSLFO, "fo:inline" ); + } + + protected Element createLeader() + { + return document.createElementNS( NS_XSLFO, "fo:leader" ); + } + + protected 
Element createListBlock() + { + return document.createElementNS( NS_XSLFO, "fo:list-block" ); + } + + protected Element createListItem() + { + return document.createElementNS( NS_XSLFO, "fo:list-item" ); + } + + protected Element createListItemBody() + { + return document.createElementNS( NS_XSLFO, "fo:list-item-body" ); + } + + protected Element createListItemLabel( String text ) + { + Element result = document.createElementNS( NS_XSLFO, + "fo:list-item-label" ); + Element block = createBlock(); + block.appendChild( document.createTextNode( text ) ); + result.appendChild( block ); + return result; + } + + protected Element createTableBody() + { + return document.createElementNS( NS_XSLFO, "fo:table-body" ); + } + + protected Element createTableCell() + { + return document.createElementNS( NS_XSLFO, "fo:table-cell" ); + } + + protected Element createTableHeader() + { + return document.createElementNS( NS_XSLFO, "fo:table-header" ); + } + + protected Element createTableRow() + { + return document.createElementNS( NS_XSLFO, "fo:table-row" ); + } + + protected Text createText( String data ) + { + return document.createTextNode( data ); + } + + public Document getDocument() + { + return document; + } + +} diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java index 8e2013fbce..b9022c916e 100644 --- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java +++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordToFoExtractor.java @@ -16,7 +16,6 @@ * limitations under the License. 
* ==================================================================== */ - package org.apache.poi.hwpf.extractor; import java.io.File; @@ -25,6 +24,9 @@ import java.io.FileWriter; import java.io.IOException; import java.util.HashMap; import java.util.Map; +import java.util.Stack; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; @@ -46,6 +48,8 @@ import org.apache.poi.hwpf.usermodel.Table; import org.apache.poi.hwpf.usermodel.TableCell; import org.apache.poi.hwpf.usermodel.TableIterator; import org.apache.poi.hwpf.usermodel.TableRow; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Text; @@ -55,7 +59,30 @@ import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH; /** * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) */ -public class WordToFoExtractor { +public class WordToFoExtractor extends AbstractToFoExtractor +{ + + /** + * Holds properties values, applied to current fo:block element. + * Those properties shall not be doubled in children fo:inline + * elements. 
+ */ + private static class BlockProperies + { + final boolean pBold; + final String pFontName; + final int pFontSize; + final boolean pItalic; + + public BlockProperies( String pFontName, int pFontSize, boolean pBold, + boolean pItalic ) + { + this.pFontName = pFontName; + this.pFontSize = pFontSize; + this.pBold = pBold; + this.pItalic = pItalic; + } + } private static final byte BEL_MARK = 7; @@ -65,218 +92,311 @@ public class WordToFoExtractor { private static final byte FIELD_SEPARATOR_MARK = 20; - private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format"; + private static final POILogger logger = POILogFactory + .getLogger( WordToFoExtractor.class ); - private static HWPFDocument loadDoc(File docFile) throws IOException { - final FileInputStream istream = new FileInputStream(docFile); - try { - return new HWPFDocument(istream); - } finally { - try { - istream.close(); - } catch (Exception exc) { - // no op - } - } + private static HWPFDocument loadDoc( File docFile ) throws IOException + { + final FileInputStream istream = new FileInputStream( docFile ); + try + { + return new HWPFDocument( istream ); + } + finally + { + try + { + istream.close(); + } + catch ( Exception exc ) + { + logger.log( POILogger.ERROR, + "Unable to close FileInputStream: " + exc, exc ); + } + } } - static Document process(File docFile) throws Exception { - final HWPFDocument hwpfDocument = loadDoc(docFile); - WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( - DocumentBuilderFactory.newInstance().newDocumentBuilder() - .newDocument()); - wordToFoExtractor.processDocument(hwpfDocument); - return wordToFoExtractor.getDocument(); + /** + * Java main() interface to interact with WordToFoExtractor + * + *

<p> + * Usage: WordToFoExtractor infile outfile + *
</p>
+ * Where infile is an input .doc file ( Word 97-2007) which will be rendered + * as XSL-FO into outfile + * + */ + public static void main( String[] args ) + { + if ( args.length < 2 ) + { + System.err + .println( "Usage: WordToFoExtractor " ); + return; + } + + System.out.println( "Converting " + args[0] ); + System.out.println( "Saving output to " + args[1] ); + try + { + Document doc = WordToFoExtractor.process( new File( args[0] ) ); + + FileWriter out = new FileWriter( args[1] ); + DOMSource domSource = new DOMSource( doc ); + StreamResult streamResult = new StreamResult( out ); + TransformerFactory tf = TransformerFactory.newInstance(); + Transformer serializer = tf.newTransformer(); + // TODO set encoding from a command argument + serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" ); + serializer.setOutputProperty( OutputKeys.INDENT, "yes" ); + serializer.transform( domSource, streamResult ); + out.close(); + } + catch ( Exception e ) + { + e.printStackTrace(); + } } - private final Document document; - - private final Element layoutMasterSet; - - private final Element root; - - public WordToFoExtractor(Document document) throws Exception { - this.document = document; - - root = document.createElementNS(NS_XSLFO, "fo:root"); - document.appendChild(root); - - layoutMasterSet = document.createElementNS(NS_XSLFO, - "fo:layout-master-set"); - root.appendChild(layoutMasterSet); + static Document process( File docFile ) throws Exception + { + final HWPFDocument hwpfDocument = loadDoc( docFile ); + WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToFoExtractor.processDocument( hwpfDocument ); + return wordToFoExtractor.getDocument(); } - protected Element addFlowToPageSequence(final Element pageSequence, - String flowName) { - final Element flow = document.createElementNS(NS_XSLFO, "fo:flow"); - flow.setAttribute("flow-name", flowName); - 
pageSequence.appendChild(flow); + private final Stack blocksProperies = new Stack(); - return flow; + /** + * Creates new instance of {@link WordToFoExtractor}. Can be used for output + * several {@link HWPFDocument}s into single FO document. + * + * @param document + * XML DOM Document used as XSL FO document. Shall support + * namespaces + */ + public WordToFoExtractor( Document document ) + { + super( document ); } - protected Element addListItem(Element listBlock) { - Element result = createListItem(); - listBlock.appendChild(result); - return result; + protected String createPageMaster( SectionProperties sep, String type, + int section ) + { + float height = sep.getYaPage() / TWIPS_PER_INCH; + float width = sep.getXaPage() / TWIPS_PER_INCH; + float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH; + float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH; + float topMargin = sep.getDyaTop() / TWIPS_PER_INCH; + float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH; + + // add these to the header + String pageMasterName = type + "-page" + section; + + Element pageMaster = addSimplePageMaster( pageMasterName ); + pageMaster.setAttribute( "page-height", height + "in" ); + pageMaster.setAttribute( "page-width", width + "in" ); + + Element regionBody = addRegionBody( pageMaster ); + regionBody.setAttribute( "margin", topMargin + "in " + rightMargin + + "in " + bottomMargin + "in " + leftMargin + "in" ); + + /* + * 6.4.14 fo:region-body + * + * The values of the padding and border-width traits must be "0". 
+ */ + // WordToFoUtils.setBorder(regionBody, sep.getBrcTop(), "top"); + // WordToFoUtils.setBorder(regionBody, sep.getBrcBottom(), "bottom"); + // WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left"); + // WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right"); + + if ( sep.getCcolM1() > 0 ) + { + regionBody + .setAttribute( "column-count", "" + (sep.getCcolM1() + 1) ); + if ( sep.getFEvenlySpaced() ) + { + regionBody.setAttribute( "column-gap", + (sep.getDxaColumns() / TWIPS_PER_INCH) + "in" ); + } + else + { + regionBody.setAttribute( "column-gap", "0.25in" ); + } + } + + return pageMasterName; } - protected Element addListItemBody(Element listItem) { - Element result = createListItemBody(); - listItem.appendChild(result); - return result; + protected boolean processCharacters( HWPFDocument hwpfDocument, + int currentTableLevel, Paragraph paragraph, final Element block, + final int start, final int end ) + { + boolean haveAnyText = false; + + for ( int c = start; c < end; c++ ) + { + CharacterRun characterRun = paragraph.getCharacterRun( c ); + + if ( hwpfDocument.getPicturesTable().hasPicture( characterRun ) ) + { + Picture picture = hwpfDocument.getPicturesTable() + .extractPicture( characterRun, true ); + + processImage( block, characterRun.text().charAt( 0 ) == 0x01, + picture ); + continue; + } + + String text = characterRun.text(); + if ( text.getBytes().length == 0 ) + continue; + + if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) + { + int skipTo = tryField( hwpfDocument, paragraph, + currentTableLevel, c, block ); + + if ( skipTo != c ) + { + c = skipTo; + continue; + } + + continue; + } + if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) + { + // shall not appear without FIELD_BEGIN_MARK + continue; + } + if ( text.getBytes()[0] == FIELD_END_MARK ) + { + // shall not appear without FIELD_BEGIN_MARK + continue; + } + + if ( characterRun.isSpecialCharacter() || characterRun.isObj() + || characterRun.isOle2() ) + { + continue; + } + + 
BlockProperies blockProperies = this.blocksProperies.peek(); + Element inline = createInline(); + if ( characterRun.isBold() != blockProperies.pBold ) + { + WordToFoUtils.setBold( inline, characterRun.isBold() ); + } + if ( characterRun.isItalic() != blockProperies.pItalic ) + { + WordToFoUtils.setItalic( inline, characterRun.isItalic() ); + } + if ( !WordToFoUtils.equals( characterRun.getFontName(), + blockProperies.pFontName ) ) + { + WordToFoUtils + .setFontFamily( inline, characterRun.getFontName() ); + } + if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize ) + { + WordToFoUtils.setFontSize( inline, + characterRun.getFontSize() / 2 ); + } + WordToFoUtils.setCharactersProperties( characterRun, inline ); + block.appendChild( inline ); + + if ( text.endsWith( "\r" ) + || (text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0) ) + text = text.substring( 0, text.length() - 1 ); + + Text textNode = createText( text ); + inline.appendChild( textNode ); + + haveAnyText |= text.trim().length() != 0; + } + + return haveAnyText; } - protected Element addListItemLabel(Element listItem, String text) { - Element result = createListItemLabel(text); - listItem.appendChild(result); - return result; + public void processDocument( HWPFDocument hwpfDocument ) + { + final Range range = hwpfDocument.getRange(); + + for ( int s = 0; s < range.numSections(); s++ ) + { + processSection( hwpfDocument, range.getSection( s ), s ); + } } - protected Element addPageSequence(String pageMaster) { - final Element pageSequence = document.createElementNS(NS_XSLFO, - "fo:page-sequence"); - pageSequence.setAttribute("master-reference", pageMaster); - root.appendChild(pageSequence); - return pageSequence; + protected void processField( HWPFDocument hwpfDocument, + Element currentBlock, Paragraph paragraph, int currentTableLevel, + int beginMark, int separatorMark, int endMark ) + { + + Pattern hyperlinkPattern = Pattern + .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ 
\\t\\r\\n]*" ); + Pattern pagerefPattern = Pattern + .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" ); + + if ( separatorMark - beginMark > 1 ) + { + CharacterRun firstAfterBegin = paragraph + .getCharacterRun( beginMark + 1 ); + + final Matcher hyperlinkMatcher = hyperlinkPattern + .matcher( firstAfterBegin.text() ); + if ( hyperlinkMatcher.matches() ) + { + String hyperlink = hyperlinkMatcher.group( 1 ); + processHyperlink( hwpfDocument, currentBlock, paragraph, + currentTableLevel, hyperlink, separatorMark + 1, + endMark ); + return; + } + + final Matcher pagerefMatcher = pagerefPattern + .matcher( firstAfterBegin.text() ); + if ( pagerefMatcher.matches() ) + { + String pageref = pagerefMatcher.group( 1 ); + processPageref( hwpfDocument, currentBlock, paragraph, + currentTableLevel, pageref, separatorMark + 1, endMark ); + return; + } + } + + StringBuilder debug = new StringBuilder( "Unsupported field type: \n" ); + for ( int i = beginMark; i <= endMark; i++ ) + { + debug.append( "\t" ); + debug.append( paragraph.getCharacterRun( i ) ); + debug.append( "\n" ); + } + logger.log( POILogger.WARN, debug ); + + // just output field value + if ( separatorMark + 1 < endMark ) + processCharacters( hwpfDocument, currentTableLevel, paragraph, + currentBlock, separatorMark + 1, endMark ); + + return; } - protected Element addRegionBody(Element pageMaster) { - final Element regionBody = document.createElementNS(NS_XSLFO, - "fo:region-body"); - pageMaster.appendChild(regionBody); + protected void processHyperlink( HWPFDocument hwpfDocument, + Element currentBlock, Paragraph paragraph, int currentTableLevel, + String hyperlink, int beginTextInclusive, int endTextExclusive ) + { + Element basicLink = createBasicLinkExternal( hyperlink ); + currentBlock.appendChild( basicLink ); - return regionBody; - } - - protected Element addSimplePageMaster(String masterName) { - final Element simplePageMaster = document.createElementNS(NS_XSLFO, - 
"fo:simple-page-master"); - simplePageMaster.setAttribute("master-name", masterName); - layoutMasterSet.appendChild(simplePageMaster); - - return simplePageMaster; - } - - protected Element addTable(Element flow) { - final Element table = document.createElementNS(NS_XSLFO, "fo:table"); - flow.appendChild(table); - return table; - } - - protected Element createBlock() { - return document.createElementNS(NS_XSLFO, "fo:block"); - } - - protected Element createExternalGraphic(String source) { - Element result = document.createElementNS(NS_XSLFO, - "fo:external-graphic"); - result.setAttribute("src", "url('" + source + "')"); - return result; - } - - protected Element createInline() { - return document.createElementNS(NS_XSLFO, "fo:inline"); - } - - protected Element createLeader() { - return document.createElementNS(NS_XSLFO, "fo:leader"); - } - - protected Element createListBlock() { - return document.createElementNS(NS_XSLFO, "fo:list-block"); - } - - protected Element createListItem() { - return document.createElementNS(NS_XSLFO, "fo:list-item"); - } - - protected Element createListItemBody() { - return document.createElementNS(NS_XSLFO, "fo:list-item-body"); - } - - protected Element createListItemLabel(String text) { - Element result = document.createElementNS(NS_XSLFO, - "fo:list-item-label"); - Element block = createBlock(); - block.appendChild(document.createTextNode(text)); - result.appendChild(block); - return result; - } - - protected String createPageMaster(SectionProperties sep, String type, - int section) { - float height = sep.getYaPage() / TWIPS_PER_INCH; - float width = sep.getXaPage() / TWIPS_PER_INCH; - float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH; - float rightMargin = sep.getDxaRight() / TWIPS_PER_INCH; - float topMargin = sep.getDyaTop() / TWIPS_PER_INCH; - float bottomMargin = sep.getDyaBottom() / TWIPS_PER_INCH; - - // add these to the header - String pageMasterName = type + "-page" + section; - - Element pageMaster = 
addSimplePageMaster(pageMasterName); - pageMaster.setAttribute("page-height", height + "in"); - pageMaster.setAttribute("page-width", width + "in"); - - Element regionBody = addRegionBody(pageMaster); - regionBody.setAttribute("margin", topMargin + "in " + rightMargin - + "in " + bottomMargin + "in " + leftMargin + "in"); - - /* - * 6.4.14 fo:region-body - * - * The values of the padding and border-width traits must be "0". - */ - // WordToFoUtils.setBorder(regionBody, sep.getBrcTop(), "top"); - // WordToFoUtils.setBorder(regionBody, sep.getBrcBottom(), "bottom"); - // WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left"); - // WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right"); - - if (sep.getCcolM1() > 0) { - regionBody.setAttribute("column-count", "" + (sep.getCcolM1() + 1)); - if (sep.getFEvenlySpaced()) { - regionBody.setAttribute("column-gap", - (sep.getDxaColumns() / TWIPS_PER_INCH) + "in"); - } else { - regionBody.setAttribute("column-gap", "0.25in"); - } - } - - return pageMasterName; - } - - protected Element createTableBody() { - return document.createElementNS(NS_XSLFO, "fo:table-body"); - } - - protected Element createTableCell() { - return document.createElementNS(NS_XSLFO, "fo:table-cell"); - } - - protected Element createTableHeader() { - return document.createElementNS(NS_XSLFO, "fo:table-header"); - } - - protected Element createTableRow() { - return document.createElementNS(NS_XSLFO, "fo:table-row"); - } - - protected Text createText(String data) { - return document.createTextNode(data); - } - - public Document getDocument() { - return document; - } - - public void processDocument(HWPFDocument hwpfDocument) { - final Range range = hwpfDocument.getRange(); - - for (int s = 0; s < range.numSections(); s++) { - processSection(hwpfDocument, range.getSection(s), s); - } + if ( beginTextInclusive < endTextExclusive ) + processCharacters( hwpfDocument, currentTableLevel, paragraph, + basicLink, beginTextInclusive, endTextExclusive 
); } /** @@ -298,304 +418,304 @@ public class WordToFoExtractor { * @param picture * HWPF object, contained picture data and properties */ - protected void processImage(Element currentBlock, boolean inlined, - Picture picture) { + protected void processImage( Element currentBlock, boolean inlined, + Picture picture ) + { // no default implementation -- skip + currentBlock.appendChild( document.createComment( "Image link to '" + + picture.suggestFullFileName() + "' can be here" ) ); } - protected void processParagraph(HWPFDocument hwpfDocument, - Element parentFopElement, int currentTableLevel, - Paragraph paragraph, String bulletText) { - final Element block = createBlock(); - parentFopElement.appendChild(block); + protected void processPageref( HWPFDocument hwpfDocument, + Element currentBlock, Paragraph paragraph, int currentTableLevel, + String pageref, int beginTextInclusive, int endTextExclusive ) + { + Element basicLink = createBasicLinkInternal( pageref ); + currentBlock.appendChild( basicLink ); - WordToFoUtils.setParagraphProperties(paragraph, block); + if ( beginTextInclusive < endTextExclusive ) + processCharacters( hwpfDocument, currentTableLevel, paragraph, + basicLink, beginTextInclusive, endTextExclusive ); + } + + protected void processParagraph( HWPFDocument hwpfDocument, + Element parentFopElement, int currentTableLevel, + Paragraph paragraph, String bulletText ) + { + final Element block = createBlock(); + parentFopElement.appendChild( block ); + + WordToFoUtils.setParagraphProperties( paragraph, block ); final int charRuns = paragraph.numCharacterRuns(); - if (charRuns == 0) { + if ( charRuns == 0 ) + { return; } - final String pFontName; - final int pFontSize; - final boolean pBold; - final boolean pItalic; { - CharacterRun characterRun = paragraph.getCharacterRun(0); - pFontSize = characterRun.getFontSize() / 2; - pFontName = characterRun.getFontName(); - pBold = characterRun.isBold(); - pItalic = characterRun.isItalic(); + final String 
pFontName; + final int pFontSize; + final boolean pBold; + final boolean pItalic; + { + CharacterRun characterRun = paragraph.getCharacterRun( 0 ); + pFontSize = characterRun.getFontSize() / 2; + pFontName = characterRun.getFontName(); + pBold = characterRun.isBold(); + pItalic = characterRun.isItalic(); + } + WordToFoUtils.setFontFamily( block, pFontName ); + WordToFoUtils.setFontSize( block, pFontSize ); + WordToFoUtils.setBold( block, pBold ); + WordToFoUtils.setItalic( block, pItalic ); + + blocksProperies.push( new BlockProperies( pFontName, pFontSize, + pBold, pItalic ) ); } - WordToFoUtils.setFontFamily(block, pFontName); - WordToFoUtils.setFontSize(block, pFontSize); - WordToFoUtils.setBold(block, pBold); - WordToFoUtils.setItalic(block, pItalic); + try + { + boolean haveAnyText = false; - StringBuilder lineText = new StringBuilder(); + if ( WordToFoUtils.isNotEmpty( bulletText ) ) + { + Element inline = createInline(); + block.appendChild( inline ); - if (WordToFoUtils.isNotEmpty(bulletText)) { - Element inline = createInline(); - block.appendChild(inline); + Text textNode = createText( bulletText ); + inline.appendChild( textNode ); - Text textNode = createText(bulletText); - inline.appendChild(textNode); + haveAnyText |= bulletText.trim().length() != 0; + } - lineText.append(bulletText); + haveAnyText = processCharacters( hwpfDocument, currentTableLevel, + paragraph, block, 0, charRuns ); + + if ( !haveAnyText ) + { + Element leader = createLeader(); + block.appendChild( leader ); + } + } + finally + { + blocksProperies.pop(); } - for (int c = 0; c < charRuns; c++) { - CharacterRun characterRun = paragraph.getCharacterRun(c); + return; + } - if (hwpfDocument.getPicturesTable().hasPicture(characterRun)) { - Picture picture = hwpfDocument.getPicturesTable() - .extractPicture(characterRun, true); + protected void processSection( HWPFDocument hwpfDocument, Section section, + int sectionCounter ) + { + String regularPage = createPageMaster( + 
WordToFoUtils.getSectionProperties( section ), "page", + sectionCounter ); - processImage(block, characterRun.text().charAt(0) == 0x01, - picture); + Element pageSequence = addPageSequence( regularPage ); + Element flow = addFlowToPageSequence( pageSequence, "xsl-region-body" ); + + processSectionParagraphes( hwpfDocument, flow, section, 0 ); + } + + protected void processSectionParagraphes( HWPFDocument hwpfDocument, + Element flow, Range range, int currentTableLevel ) + { + final Map allTables = new HashMap(); + for ( TableIterator tableIterator = WordToFoUtils.newTableIterator( + range, currentTableLevel + 1 ); tableIterator.hasNext(); ) + { + Table next = tableIterator.next(); + allTables.put( Integer.valueOf( next.getStartOffset() ), next ); + } + + final ListTables listTables = hwpfDocument.getListTables(); + int currentListInfo = 0; + + final int paragraphs = range.numParagraphs(); + for ( int p = 0; p < paragraphs; p++ ) + { + Paragraph paragraph = range.getParagraph( p ); + + if ( allTables.containsKey( Integer.valueOf( paragraph + .getStartOffset() ) ) ) + { + Table table = allTables.get( Integer.valueOf( paragraph + .getStartOffset() ) ); + processTable( hwpfDocument, flow, table, currentTableLevel + 1 ); continue; } - String text = characterRun.text(); - if (text.getBytes().length == 0) - continue; + if ( paragraph.isInTable() + && paragraph.getTableLevel() != currentTableLevel ) + { + continue; + } - if (text.getBytes()[0] == FIELD_BEGIN_MARK) { - /* - * check if we have a field with calculated image as a result. - * MathType equation, for example. 
- */ - int skipTo = tryImageWithinField(hwpfDocument, paragraph, c, - block); + if ( paragraph.getIlfo() != currentListInfo ) + { + currentListInfo = paragraph.getIlfo(); + } - if (skipTo != c) { - c = skipTo; - continue; - } - continue; - } - if (text.getBytes()[0] == FIELD_SEPARATOR_MARK) { - continue; - } - if (text.getBytes()[0] == FIELD_END_MARK) { - continue; - } + if ( currentListInfo != 0 ) + { + final ListFormatOverride listFormatOverride = listTables + .getOverride( paragraph.getIlfo() ); - if (characterRun.isSpecialCharacter() || characterRun.isObj() - || characterRun.isOle2()) { - continue; - } + String label = WordToFoUtils.getBulletText( listTables, + paragraph, listFormatOverride.getLsid() ); - Element inline = createInline(); - if (characterRun.isBold() != pBold) { - WordToFoUtils.setBold(inline, characterRun.isBold()); - } - if (characterRun.isItalic() != pItalic) { - WordToFoUtils.setItalic(inline, characterRun.isItalic()); - } - if (!WordToFoUtils.equals(characterRun.getFontName(), pFontName)) { - WordToFoUtils.setFontFamily(inline, characterRun.getFontName()); - } - if (characterRun.getFontSize() / 2 != pFontSize) { - WordToFoUtils.setFontSize(inline, - characterRun.getFontSize() / 2); - } - WordToFoUtils.setCharactersProperties(characterRun, inline); - block.appendChild(inline); - - if (text.endsWith("\r") - || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != 0)) - text = text.substring(0, text.length() - 1); - - Text textNode = createText(text); - inline.appendChild(textNode); - - lineText.append(text); - } - - if (lineText.toString().trim().length() == 0) { - Element leader = createLeader(); - block.appendChild(leader); - } - - return; - } - - protected void processSection(HWPFDocument hwpfDocument, Section section, - int sectionCounter) { - String regularPage = createPageMaster( - WordToFoUtils.getSectionProperties(section), "page", - sectionCounter); - - Element pageSequence = addPageSequence(regularPage); - Element flow 
= addFlowToPageSequence(pageSequence, "xsl-region-body"); - - processSectionParagraphes(hwpfDocument, flow, section, 0); - } - - protected void processSectionParagraphes(HWPFDocument hwpfDocument, - Element flow, Range range, int currentTableLevel) { - final Map allTables = new HashMap(); - for (TableIterator tableIterator = WordToFoUtils.newTableIterator( - range, currentTableLevel + 1); tableIterator.hasNext();) { - Table next = tableIterator.next(); - allTables.put(Integer.valueOf(next.getStartOffset()), next); - } - - final ListTables listTables = hwpfDocument.getListTables(); - int currentListInfo = 0; - - final int paragraphs = range.numParagraphs(); - for (int p = 0; p < paragraphs; p++) { - Paragraph paragraph = range.getParagraph(p); - - if (allTables.containsKey(Integer.valueOf(paragraph - .getStartOffset()))) { - Table table = allTables.get(Integer.valueOf(paragraph - .getStartOffset())); - processTable(hwpfDocument, flow, table, currentTableLevel + 1); - continue; - } - - if (paragraph.isInTable() - && paragraph.getTableLevel() != currentTableLevel) { - continue; - } - - if (paragraph.getIlfo() != currentListInfo) { - currentListInfo = paragraph.getIlfo(); - } - - if (currentListInfo != 0) { - final ListFormatOverride listFormatOverride = listTables - .getOverride(paragraph.getIlfo()); - - String label = WordToFoUtils.getBulletText(listTables, - paragraph, listFormatOverride.getLsid()); - - processParagraph(hwpfDocument, flow, currentTableLevel, - paragraph, label); - } else { - processParagraph(hwpfDocument, flow, currentTableLevel, - paragraph, WordToFoUtils.EMPTY); - } - } + processParagraph( hwpfDocument, flow, currentTableLevel, + paragraph, label ); + } + else + { + processParagraph( hwpfDocument, flow, currentTableLevel, + paragraph, WordToFoUtils.EMPTY ); + } + } } - protected void processTable(HWPFDocument hwpfDocument, Element flow, - Table table, int thisTableLevel) { - Element tableElement = addTable(flow); + protected void processTable( 
HWPFDocument hwpfDocument, Element flow, + Table table, int thisTableLevel ) + { + Element tableElement = addTable( flow ); - Element tableHeader = createTableHeader(); - Element tableBody = createTableBody(); + Element tableHeader = createTableHeader(); + Element tableBody = createTableBody(); - final int tableRows = table.numRows(); + final int tableRows = table.numRows(); - int maxColumns = Integer.MIN_VALUE; - for (int r = 0; r < tableRows; r++) { - maxColumns = Math.max(maxColumns, table.getRow(r).numCells()); - } + int maxColumns = Integer.MIN_VALUE; + for ( int r = 0; r < tableRows; r++ ) + { + maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() ); + } - for (int r = 0; r < tableRows; r++) { - TableRow tableRow = table.getRow(r); + for ( int r = 0; r < tableRows; r++ ) + { + TableRow tableRow = table.getRow( r ); - Element tableRowElement = createTableRow(); - WordToFoUtils.setTableRowProperties(tableRow, tableRowElement); + Element tableRowElement = createTableRow(); + WordToFoUtils.setTableRowProperties( tableRow, tableRowElement ); - final int rowCells = tableRow.numCells(); - for (int c = 0; c < rowCells; c++) { - TableCell tableCell = tableRow.getCell(c); + final int rowCells = tableRow.numCells(); + for ( int c = 0; c < rowCells; c++ ) + { + TableCell tableCell = tableRow.getCell( c ); - if (tableCell.isMerged() && !tableCell.isFirstMerged()) - continue; + if ( tableCell.isMerged() && !tableCell.isFirstMerged() ) + continue; - if (tableCell.isVerticallyMerged() - && !tableCell.isFirstVerticallyMerged()) - continue; + if ( tableCell.isVerticallyMerged() + && !tableCell.isFirstVerticallyMerged() ) + continue; - Element tableCellElement = createTableCell(); - WordToFoUtils.setTableCellProperties(tableRow, tableCell, - tableCellElement, r == 0, r == tableRows - 1, c == 0, - c == rowCells - 1); + Element tableCellElement = createTableCell(); + WordToFoUtils.setTableCellProperties( tableRow, tableCell, + tableCellElement, r == 0, r == tableRows - 
1, c == 0, + c == rowCells - 1 ); - if (tableCell.isFirstMerged()) { - int count = 0; - for (int c1 = c; c1 < rowCells; c1++) { - TableCell nextCell = tableRow.getCell(c1); - if (nextCell.isMerged()) - count++; - if (!nextCell.isMerged()) - break; - } - tableCellElement.setAttribute("number-columns-spanned", "" - + count); - } else { - if (c == rowCells - 1 && c != maxColumns - 1) { - tableCellElement.setAttribute("number-columns-spanned", - "" + (maxColumns - c)); - } - } + if ( tableCell.isFirstMerged() ) + { + int count = 0; + for ( int c1 = c; c1 < rowCells; c1++ ) + { + TableCell nextCell = tableRow.getCell( c1 ); + if ( nextCell.isMerged() ) + count++; + if ( !nextCell.isMerged() ) + break; + } + tableCellElement.setAttribute( "number-columns-spanned", "" + + count ); + } + else + { + if ( c == rowCells - 1 && c != maxColumns - 1 ) + { + tableCellElement + .setAttribute( "number-columns-spanned", "" + + (maxColumns - c) ); + } + } - if (tableCell.isFirstVerticallyMerged()) { - int count = 0; - for (int r1 = r; r1 < tableRows; r1++) { - TableRow nextRow = table.getRow(r1); - if (nextRow.numCells() < c) - break; - TableCell nextCell = nextRow.getCell(c); - if (nextCell.isVerticallyMerged()) - count++; - if (!nextCell.isVerticallyMerged()) - break; - } - tableCellElement.setAttribute("number-rows-spanned", "" - + count); - } + if ( tableCell.isFirstVerticallyMerged() ) + { + int count = 0; + for ( int r1 = r; r1 < tableRows; r1++ ) + { + TableRow nextRow = table.getRow( r1 ); + if ( nextRow.numCells() < c ) + break; + TableCell nextCell = nextRow.getCell( c ); + if ( nextCell.isVerticallyMerged() ) + count++; + if ( !nextCell.isVerticallyMerged() ) + break; + } + tableCellElement.setAttribute( "number-rows-spanned", "" + + count ); + } - processSectionParagraphes(hwpfDocument, tableCellElement, - tableCell, thisTableLevel); + processSectionParagraphes( hwpfDocument, tableCellElement, + tableCell, thisTableLevel ); - if (!tableCellElement.hasChildNodes()) { - 
tableCellElement.appendChild(createBlock()); - } + if ( !tableCellElement.hasChildNodes() ) + { + tableCellElement.appendChild( createBlock() ); + } - tableRowElement.appendChild(tableCellElement); - } + tableRowElement.appendChild( tableCellElement ); + } - if (tableRow.isTableHeader()) { - tableHeader.appendChild(tableRowElement); - } else { - tableBody.appendChild(tableRowElement); - } - } + if ( tableRow.isTableHeader() ) + { + tableHeader.appendChild( tableRowElement ); + } + else + { + tableBody.appendChild( tableRowElement ); + } + } - if (tableHeader.hasChildNodes()) { - tableElement.appendChild(tableHeader); - } - if (tableBody.hasChildNodes()) { - tableElement.appendChild(tableBody); - } else { - System.err.println("Table without body"); - } + if ( tableHeader.hasChildNodes() ) + { + tableElement.appendChild( tableHeader ); + } + if ( tableBody.hasChildNodes() ) + { + tableElement.appendChild( tableBody ); + } + else + { + logger.log( + POILogger.WARN, + "Table without body starting on offset " + + table.getStartOffset() + " -- " + + table.getEndOffset() ); + } } - protected int tryImageWithinField(HWPFDocument hwpfDocument, - Paragraph paragraph, int beginMark, Element currentBlock) { + protected int tryField( HWPFDocument hwpfDocument, Paragraph paragraph, + int currentTableLevel, int beginMark, Element currentBlock ) + { int separatorMark = -1; - int pictureMark = -1; - int pictureChar = Integer.MIN_VALUE; int endMark = -1; - for (int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++) { - CharacterRun characterRun = paragraph.getCharacterRun(c); + for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ ) + { + CharacterRun characterRun = paragraph.getCharacterRun( c ); String text = characterRun.text(); - if (text.getBytes().length == 0) + if ( text.getBytes().length == 0 ) continue; - if (text.getBytes()[0] == FIELD_SEPARATOR_MARK) { - if (separatorMark != -1) { + if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) + { + if ( 
separatorMark != -1 ) + { // double; return beginMark; } @@ -604,8 +724,10 @@ public class WordToFoExtractor { continue; } - if (text.getBytes()[0] == FIELD_END_MARK) { - if (endMark != -1) { + if ( text.getBytes()[0] == FIELD_END_MARK ) + { + if ( endMark != -1 ) + { // double; return beginMark; } @@ -614,63 +736,14 @@ public class WordToFoExtractor { break; } - if (hwpfDocument.getPicturesTable().hasPicture(characterRun)) { - if (c != -1) { - // double; - return beginMark; - } - - pictureMark = c; - pictureChar = characterRun.text().charAt(0); - continue; - } } - if (separatorMark == -1 || pictureMark == -1 || endMark == -1) + if ( separatorMark == -1 || endMark == -1 ) return beginMark; - final CharacterRun pictureRun = paragraph.getCharacterRun(pictureMark); - final Picture picture = hwpfDocument.getPicturesTable().extractPicture( - pictureRun, true); - - processImage(currentBlock, pictureChar == 0x01, picture); + processField( hwpfDocument, currentBlock, paragraph, currentTableLevel, + beginMark, separatorMark, endMark ); return endMark; } - - /** - * Java main() interface to interact with WordToFoExtractor - * - *

- * Usage: WordToFoExtractor infile outfile - *

- * Where infile is an input .doc file ( Word 97-2007) - * which will be rendered as XSL-FO into outfile - * - */ - public static void main(String[] args) { - if (args.length < 2) { - System.err.println("Usage: WordToFoExtractor "); - return; - } - - System.out.println("Converting " + args[0]); - System.out.println("Saving output to " + args[1]); - try { - Document doc = WordToFoExtractor.process(new File(args[0])); - - FileWriter out = new FileWriter(args[1]); - DOMSource domSource = new DOMSource(doc); - StreamResult streamResult = new StreamResult(out); - TransformerFactory tf = TransformerFactory.newInstance(); - Transformer serializer = tf.newTransformer(); - serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); // TODO set encoding from a command argument - serializer.setOutputProperty(OutputKeys.INDENT, "yes"); - serializer.transform(domSource, streamResult); - out.close(); - } catch (Exception e) { - e.printStackTrace(); - } - } - } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractor.java b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractor.java new file mode 100644 index 0000000000..8bcd5bb21c --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordToFoExtractor.java @@ -0,0 +1,95 @@ +/* + * ==================================================================== + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==================================================================== + */ +package org.apache.poi.hwpf.extractor; + +import java.io.StringWriter; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import junit.framework.TestCase; +import org.apache.poi.POIDataSamples; +import org.apache.poi.hwpf.HWPFDocument; + +/** + * Test cases for {@link WordToFoExtractor} + * + * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) + */ +public class TestWordToFoExtractor extends TestCase +{ + private static String getFoText( final String sampleFileName ) + throws Exception + { + HWPFDocument hwpfDocument = new HWPFDocument( POIDataSamples + .getDocumentInstance().openResourceAsStream( sampleFileName ) ); + + WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( + DocumentBuilderFactory.newInstance().newDocumentBuilder() + .newDocument() ); + wordToFoExtractor.processDocument( hwpfDocument ); + + StringWriter stringWriter = new StringWriter(); + + Transformer transformer = TransformerFactory.newInstance() + .newTransformer(); + transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); + transformer.transform( + new DOMSource( wordToFoExtractor.getDocument() ), + new StreamResult( stringWriter ) ); + + String result = stringWriter.toString(); + return result; + } + + public void testHyperlink() throws Exception + { + final 
String sampleFileName = "hyperlink.doc"; + String result = getFoText( sampleFileName ); + + assertTrue( result + .contains( "" ) ); + assertTrue( result.contains( "Hyperlink text" ) ); + } + + public void testEquation() throws Exception + { + final String sampleFileName = "equation.doc"; + String result = getFoText( sampleFileName ); + + assertTrue( result + .contains( "" ) ); + } + + public void testPageref() throws Exception + { + final String sampleFileName = "pageref.doc"; + String result = getFoText( sampleFileName ); + + System.out.println( result ); + + assertTrue( result + .contains( "" ) ); + assertTrue( result.contains( "1" ) ); + } +} diff --git a/test-data/document/equation.doc b/test-data/document/equation.doc new file mode 100644 index 0000000000000000000000000000000000000000..e1bda06de227dc9f5112a16aa8d9f36c0c8454c4 GIT binary patch literal 13824 zcmeHN30PCd+CC=)2%APg+)x5Wv7kXf5f@ew5JW)4<%(bkp+X@+2$q82R!aW1^#zV5*w+q`1iU0?SJ|5yqP((eakH8J2O-E zQ)TqWLElNWrokrY9`OKv#7UBerO-)V8WJEjYZ)AZ6g{)oBfown> zAbqY0Z~!ep8_)rC0X?7-&>6rGp!5OK{{{#Rfv$?@ZV0;rJrvJ>Ba{4Bi8_Ij;wPhc z)EG*FFqZlZJ=S(I!iZ5dSX*kUt`}`7-&&at{(o4YHCvHBC&b*vHE zDB?p94h8IhVSqg_92fzN1V#Z4fFs}pI0G(#D=->x1I7UEfCu0Sj0L;^Z(tnY1B?fJ z0YAVWm;eL-6M;Zr5-=H<0!#(|0ZapefayRm5CY5qgg_`D0>S_>5Dt6{%mgBUNZ=EI z=o_sFNt|F7o@0SHKnBDEvw=^61c1!uL?8*610(|}z+7M+K%-SPEbwTmNkfZ;6pA{* z{8R2^{q~6v#Y*F(;WFN2X>8aaH|Zzw(PD{=c_;V;kTf@`h zPg_y07L>1-8s!g+_UcZ>dG=rv_<6A@i6@&A?4?c#ksjhXoGk<1A%-jE;{3GfpUfEQ zfzcF-o)DrZ5;0N(5DKxXhT%UJ|8eN$nTSQ8he=K>(_?=$>P0o0&=iX%<4ud|2Dxn^ z_ee%|NA?J?*MpV@Idzm_P6DODr}2jM_0oX&8W7)@B4W!$HpbX@<&w6@9Oy*#VY{#$ zn0Cnja5j%}We-J*E^Jq{s*V0L$ie#?tx^=9vf|RTKFvp5gED11vF%Y#&YwkfftL{M zr3rnBtY$QsI~=MvV`CfqJD~JXMpAvWnZUHE3rY1E1KV+EQkmFvI)@f8ZP*|r`u2r9 zq^AZ$rlHV!99mJwxgPa|D*4ZPidCAMfDO1o5e3TQdXYAop-K$uX2I5p>LiVGzX9R98 zhN(^0Zzf*M(aZd%3tvBU@rDACSlcL6l*YD}dFqbrVrgmlgq5K2rRmp|UFta3<_7l8 z7u?97c_Hi73a9v*Y}w5_xp5Q5CQ&~>$&Wr_I{0|5Mr2>-nzH+YZ{M-gG`8;ZoyViH zbeYcdz?CB_)PG!)R~er={$h~P)WqPC+p0T%RJCm5(u-q$b3ZmoZ~ET74o7_U);f7S 
zUhyL1cfP~jT@~jXyJ>${?NC0e>8ndqPHBAgGG$Nq%0nfy^rtNwG{azg(Ynn=bH|U| zTG>0H+Gfu&_YmvB&U@V|ihj%GSC_cYcx)YF@Qju0eX!=a&cqlej_9z{iyws_-=4bQ z;a2sMM)N|U)5$XiB_o$#E%m6+(h=3peIZ)8xZ&e?tDYVkzaM73NbPAb|M`s7vZvyc zjkf|0+&sI^WYF%UGmE~iO}%D*{M4=TZRliy*@cQ(z`yTh}cDl%9dGaTkbnD;IBClnP?EDUg7AHDl_GDV54X{@4^N9s zG5zE6RQaNUsYg7nPSj|jG)gG3m#4m|MNnB|M z5^qjlFbNZgsvu=3e|q;nDI@nj-<_{a4M({lcvy#!Sb4i5ROUaxa{*8YkT3BMrBc-Y z5Lyd|H&mtWWyHy@U9bmut2~oIPGFuAYq#UeKRC%K49yI4Rjl-(>p%=RQRc$e_hBfzZj`) z=abL#m>=_L`5a@Elg5Z8Vah`-AE9jKn-eZ;=G1FUbf`EiOdQ7Z4UH6wWGXGUEsAeR zm2aiC)oEUe>;n<{^A4Ciy`}&8$Xo!BeJ!%9wFpQB76V@ZOMounkfFvq9B3dl_EJiGhiHV9331tydiPdaLm_RCw zi-;4vH8;YgvC;8SLV0@}4u@$EO}rs8NJUbK45!C}gy^V6nJ&I+rrbrfjcHn%R_RO2 z*X$RRtv64au-9k6sp(md%T5eQ->+?t>?fLro|BI!^Da#2J?1gA%u#+_Q4q%I!hJtm9*gf&};04cb&#)XiP&G$rSD zq5tT#D$n1}&L7%6mtSLl=NEpBh3ocvXP(#oW7PHJ`r>m5J4Gj;*~Odu=c`wf41P zp9=F|%l4;hmfkDdf6n%Eh5yUGMV zo5pQ)@l5-+#?H4_zS^i0J*t-GCpjHb`Tiv_dLnAUbuk2&i+hK#Z z(`Ps1Mc*a!v%EH~;O6XQne@H=U`W&RnR9i#H|F)I@}2(au8H@OZbcp#G5vtA|G;m5 zefje6!Q}Ow^fG>WaQE2GuxFnbnmu+2cfEVg=;z2Gj)`YJ{pITLmli_?9bA3dbX7&o zL+VV4Rv>rX75k$lrw`=kW;WU`DtC;O{QT|l>QVQftY($6`wmOqYL}UEGQ;XdFt&Wx zCmg>@pIfDO^h??HuhynsPp{t*`OR^)=>zv2bNnRv$8R<~Da$K<{mY=wxH?;8M!%nB z1=}3C-QS?I@mJo4KRgdfJVt89xi z>SFEHu4&|3?ak<&Dorb$7jtl!`8ZdxYe*M&wQ0-NXog#jJ?a%aU{ZOM!3goy@s!%p zr6+eeW;JR?vTC+!ix&Ro{LC)frohT#$%gXpuiNHr->@rUl9*2&?K$rJeoHgcDS<+} zC4pVEPV7!?(AaNruj{rc>H4$Ao*PEj$&T?u^XzmZ9|>Q@8gn&PFPpBVKWhGnyh-t& z@3J}{*?HFN`$@e|`VokU`Xm0z&8Q`bG` z)A`O(mM*it&L1H1eAc6|$lYl1w|?V~8kMd0|77ykoH38ipWLx!-O7NS+alA_xq+se zw=_hLe(~d_BNn?o-OG=Ry7hTL*ytZ3x-T|bI=T45qbZ)>pEj#@Ov{rbR~}mBp0L1w zNr-RU+%5IX8>8x;Hl{vkpvl~ zf`ej81zD*+$28`LiK_4Wo_vt7b9RArvd`dqiRbb*E*g-uGGtHcEM8Zi*;6yM)^wfL z)nG!mN8K}~9IGk6cRBxi-NzeBc5@c5Uvs`7Zu{M7u6q6Z)#lyF3JBhrYZCnY;+Dsi zTzBaVje(u63r;uOPhO?FX{cvL$f+6M77jf4yYIft1TPz{lr!f`gB-Iq-F8nIm2*su zm_fj+^86l3o@JIOnq^2fq2Kd5Krt0kz8w6#<+YTHzUrc<@Jtl7;E}1m0-9G;?UA>4 z6eA&=>I&Jsrm7Ne?=@>+H~O9Z=jW|X_V01#%%&b9)MK7VO1r3~^AlG2`SNT*FYb!f zI6oQqGM{*& 
z;TPdeNYQMI0g?5!ro4z7vnwWSv3yIJO8w>B+s2=CJvp~0q>jrdWS5RyO(J)X$6~iG z93vu@Ns-~Q4Y{E5ei{J%$;>6^jBVAQ%)>lIlam=Uoro+!GyeMh#c?Y*F23_c%mZlubTf#8#L$ju4giymGeJ_xtK=w ztjTTA2!wIC8ytstEaF1Uf|iMo@u!O-?cqa;w-P<2mF=1 xNhnWk`9B8PofYek`k!WA`_=>d58$5;{+=z1*sk36>aRNf+r{WTh5tAF{|AwF-pl|1 literal 0 HcmV?d00001 diff --git a/test-data/document/hyperlink.doc b/test-data/document/hyperlink.doc new file mode 100644 index 0000000000000000000000000000000000000000..5e64b25b2c76a802ffa258e3744eaca6940fe41d GIT binary patch literal 9728 zcmeI2OH5Q(7{|{YE;xf0hNpm7N2E$oK|vHm5JprOL=aT8K0&akFhgNTwJ{B`Y0^cT z*mTi0Y0}u(MIUKsn{?4lyJ?dy+PZP!&Mq}wRMIwa0n^`i?hH&R4i~VdH1m=FJ!j56 z=X;&kch2S3&uMpWe4h7<$-n;;4i2=YOnH>4yh~jsC~+2 z#6+>1S_firh+Z#Z)wBC9E`Qnsc6=7?|H%5ikohlrfz1DOkO4A57RUxGK@P|Td0-XD z2dlx;U=8qqwV(j31BF2JSWmhkoPUZjT#{nv8g+)!XR za6ahi3=H*bZU|f&8R+!~^`6kuA!QALb2=)Jd#?EQ+=)dp zd%VT2!9Jx%(L1D3TWt8ro&iDNQ^CkxF zJ0MXShVv4sNkk^Ap{#1MLS-Dw3}uay(Z3{h$U-e7uUWKcYz1}uv#&G%=BC38*#Xtd z0QN9Ad)14i5;zR#;mx88pUC8k#jS(jVt^?UQ2kUZUvE!(y`C~%x{YW&8cRg@-CP>C zCfVcDiM4(f`4MgDas1$63%I~U9>NL_TSPCv*iLNnJ3y;GT1ddvNGgkiwBK*q(`PQb zcH89hC_S1g?CH^f-52r6Q9R>ep_64!Mn?}Ap#&NtYTzGJBM8R3qe zN~{8z5E42V{3%+zqKDYuOCwD;Bq$G+q*|O7;em9C!yYruSWP@OJ89mUj4oF1cVW=5 zSSGiAxPRs0*FUAbeocL>nm<}D)+d0wGs=r7>tpQ*szHL!(pBydW|e$<_)Rm z^>$!Wr%6t1##H1q-AE?M-B;fqf7sgR{^U)k+PLA%JL27|K&F$tmsr3B>Hs06-d^OM z;22l|WM7rtX%L7>_a>~^U-H7M4n(Lfk|BLmKB~%8_SX*$qs8NrDV+X|RcZ}_@D7=x z_H+gMyZn__=~O$(xXf%-sP|;6ShmMl6bhy4-B)ave8dlupGmLjf)B)0D3V&gW`9Z@ zD_WTSeWdg&>TTo4eNyrApW!s}zKW4EMwx?DPGSqC92tQNpnC);-N=tnA$;Ui7^nZI z2V|VcIDFFNP}!y^fgE(?T)R~1aen_5pUnJ*G@D=KU^$v=y)pRmCKt(rd{OJ>N^!oF zP7g>Q$gQ}1!?N2xBFR8*Kl}SaDH-V^S8TE?U-XTf^tbZWunWr5o$oXe5G0_MeMhv+mrtc!sOmeD3QzK5&B;~CCL9CG;w`w#dA^;B<7L}%3O8kf9`T3)A M0rKTut}$%dU$s34jsO4v literal 0 HcmV?d00001 diff --git a/test-data/document/pageref.doc b/test-data/document/pageref.doc new file mode 100644 index 0000000000000000000000000000000000000000..c8a69772796b442a02b3ba29c6c411f91350b9b2 GIT binary patch literal 9728 
zcmeI2OKenC7=X{6xoxKfN;@s(Ekh{}3zR;g<=GB}mVwgJ0tFFp`bvv*2B&QV;{q`z zE;PZoFfk^E1WkOzRNT4I#AtM(vSGo-;8Nm3i7pz`j^BUoy=4kzdOIP+nmLp2o^#JV z_niO#&->iD`E&lAYo9OsRXyN#D6g7`XRFMFc!O}Bb5FoSSmGw)@wgLBFzx1$1;%Yz zr4CIhF$1K|v%m-0kOT8zJ}iJ-SP1e?zU|K;mD8doSfE`+`3D)T3IebiieU|i3~OzFJ^lteT#CODHo;~ngDp@F6;KIPPz^P(6}CYw)WLSx0XtzA z?1nv14-F86z0e5zU_UfLGaP^r9E3yA0*9d$+TaMZLkAp%V{jZg;RKw7XW$e(3#Z`> zoQ3Bg45D|p?ccA-ClfL#-OB(krDbNzjIPwz`l|Lz{k05q$D%{gzTrS;G}g1ZG5Y+- zU~gnt&j~FZ64w~*)~f;umvN(?m({Aj>TbMXWt8k{Sj&iHNs>PE)8cYig`8qWvZU8` z&-*j((e}?$`DVGf&D?EPng`7~rZ}0^DoRvoiLpfmOI2>B%0l&=NJ&ywnuhLZ*r| zo4zxl3>7B+Y;&DC4AV#wR1Nj_;@5N=T1so`QbW|~uzHD>+JPTtRPEs!<2t0eRX?Et z)u*4txF))gnpkBhk0I?XTP;TJD&*dwW#4UXr}zbES#bRiRe1fP$}BPHVXdJ|#Lqp%&oKN( z3E@n&+T3H-lFlmMLw}btvd~v#EjFa@_|!^m<4Uf(NL`~P^^@mCo#%Wc^=kuf^%?TA z*z~l|DARdV;)}j(kVk5&9GPm+`VhHfFs{hkR+&9IebDG3{Ss9?sX;(hqLI8=ttAj0 zcM{WxUr)%&QS>ad^_2HpLq06$L#{$BmyK3F1d}mLDH(kzUKTpKXNW&*M?*Ud8SFH3?!-EbLB$;atd$ujF;*)T}skE-(|t6Jx-^akA- z!ulPN_;Ru$>}NAc+?wewpH8%mFno__O;2Kn0E7DgT{MOk0S5nG?wBZC=68s<`guc^ zQqB0%?dAO@)15ps>9xmYK97>)K#e;&8qxb4HaU)E0t{g?fJy6ULB9wv^|{Q+t%ZYh zHN*EN{oz}hTEAge^rUA+NGp^L2!k3|Q@wgs{Hd{WCvLqLfyL2>(=K%j_+^Fln4MEc zcD4bRl{9NfF1wR^h-8iKl0Gt(@8%D8FO7ZqQ~n!Q)W_=JNApBS8H$Ce1NuXGOt53J zn)K91(&+EJ_Wr%G*8aRt-twu9>%X`yHh39i7Ln&Nx6A $3QZ^{MRBPk?k4nI~nx zegQC+MJ>%u7z2WHjaHQ6X zoqSIM*=CU{(Q~X7?aXRZ5+&Tk;|uj{D!NP9v4eykbg%1(c2O1Q_}1O@S4m^d{?k8q zV*5x}&CR>`V&xBQ-+5l(Rce&D8zA%iOzwL--7j`Nck!j++`~`BeZzvIE8*PsO9yS?6Y?Wl2B&Q_4Ag3SW>hEzbX*cIBu+Zas z#d}WF$n+VEvqZ<$^Hv(f))vUSeAG9D^