bug 51351: more progress with WordToFoExtractor: support for hyperlinks, common fields and code cleanup

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1137673 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yegor Kozlov 2011-06-20 15:56:28 +00:00
parent 49448123e1
commit 21885a6fd5
6 changed files with 867 additions and 493 deletions

View File

@ -0,0 +1,206 @@
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/
package org.apache.poi.hwpf.extractor;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
/**
 * Base class for converters that build an XSL-FO tree inside a DOM
 * {@link Document}.
 * <p>
 * The constructor creates the {@code fo:root} element together with its
 * {@code fo:layout-master-set} child. The {@code add*} methods create an
 * element and attach it to a parent; the {@code create*} methods only create
 * a detached element and leave the attaching to the caller. All elements are
 * created in the XSL-FO namespace with the {@code fo:} prefix.
 */
public abstract class AbstractToFoExtractor
{
    /** Namespace URI used for every element created by this class. */
    private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";

    /** DOM document that receives the generated XSL-FO tree. */
    protected final Document document;

    /** The {@code fo:layout-master-set} element, first child of {@link #root}. */
    protected final Element layoutMasterSet;

    /** The {@code fo:root} element, appended directly to {@link #document}. */
    protected final Element root;

    /**
     * Creates the skeleton of the XSL-FO tree inside the given document.
     *
     * @param document
     *            DOM document to populate; it receives {@code fo:root} as a
     *            child node
     */
    public AbstractToFoExtractor( Document document )
    {
        this.document = document;

        this.root = newFoElement( "fo:root" );
        document.appendChild( this.root );

        this.layoutMasterSet = newFoElement( "fo:layout-master-set" );
        this.root.appendChild( this.layoutMasterSet );
    }

    /** Creates a detached element with the given qualified name in the XSL-FO namespace. */
    private Element newFoElement( String qualifiedName )
    {
        return document.createElementNS( NS_XSLFO, qualifiedName );
    }

    /** Appends {@code child} to {@code parent} and hands the child back. */
    private static Element attach( Element parent, Element child )
    {
        parent.appendChild( child );
        return child;
    }

    /**
     * Appends a new {@code fo:flow} to a page sequence.
     *
     * @param pageSequence
     *            element that receives the flow
     * @param flowName
     *            value of the {@code flow-name} attribute
     * @return the new flow element
     */
    protected Element addFlowToPageSequence( final Element pageSequence,
            String flowName )
    {
        Element flowElement = newFoElement( "fo:flow" );
        flowElement.setAttribute( "flow-name", flowName );
        return attach( pageSequence, flowElement );
    }

    /**
     * Appends a new list item (see {@link #createListItem()}) to a list block.
     *
     * @return the new list item element
     */
    protected Element addListItem( Element listBlock )
    {
        return attach( listBlock, createListItem() );
    }

    /**
     * Appends a new list item body (see {@link #createListItemBody()}) to a
     * list item.
     *
     * @return the new list item body element
     */
    protected Element addListItemBody( Element listItem )
    {
        return attach( listItem, createListItemBody() );
    }

    /**
     * Appends a new list item label (see {@link #createListItemLabel(String)})
     * to a list item.
     *
     * @param text
     *            label text
     * @return the new list item label element
     */
    protected Element addListItemLabel( Element listItem, String text )
    {
        return attach( listItem, createListItemLabel( text ) );
    }

    /**
     * Appends a new {@code fo:page-sequence} to the root element.
     *
     * @param pageMaster
     *            value of the {@code master-reference} attribute
     * @return the new page sequence element
     */
    protected Element addPageSequence( String pageMaster )
    {
        Element sequence = newFoElement( "fo:page-sequence" );
        sequence.setAttribute( "master-reference", pageMaster );
        return attach( root, sequence );
    }

    /**
     * Appends a new {@code fo:region-body} to a page master.
     *
     * @return the new region body element
     */
    protected Element addRegionBody( Element pageMaster )
    {
        return attach( pageMaster, newFoElement( "fo:region-body" ) );
    }

    /**
     * Appends a new {@code fo:simple-page-master} to the layout master set.
     *
     * @param masterName
     *            value of the {@code master-name} attribute
     * @return the new page master element
     */
    protected Element addSimplePageMaster( String masterName )
    {
        Element master = newFoElement( "fo:simple-page-master" );
        master.setAttribute( "master-name", masterName );
        return attach( layoutMasterSet, master );
    }

    /**
     * Appends a new {@code fo:table} to the given element.
     *
     * @return the new table element
     */
    protected Element addTable( Element flow )
    {
        return attach( flow, newFoElement( "fo:table" ) );
    }

    /**
     * Creates a detached {@code fo:basic-link} with its
     * {@code external-destination} attribute set to the given value.
     */
    protected Element createBasicLinkExternal( String externalDestination )
    {
        Element link = newFoElement( "fo:basic-link" );
        link.setAttribute( "external-destination", externalDestination );
        return link;
    }

    /**
     * Creates a detached {@code fo:basic-link} with its
     * {@code internal-destination} attribute set to the given value.
     */
    protected Element createBasicLinkInternal( String internalDestination )
    {
        Element link = newFoElement( "fo:basic-link" );
        link.setAttribute( "internal-destination", internalDestination );
        return link;
    }

    /** Creates a detached {@code fo:block}. */
    protected Element createBlock()
    {
        return newFoElement( "fo:block" );
    }

    /**
     * Creates a detached {@code fo:external-graphic} whose {@code src}
     * attribute is the given source wrapped as {@code url('...')}.
     */
    protected Element createExternalGraphic( String source )
    {
        Element graphic = newFoElement( "fo:external-graphic" );
        graphic.setAttribute( "src", "url('" + source + "')" );
        return graphic;
    }

    /** Creates a detached {@code fo:inline}. */
    protected Element createInline()
    {
        return newFoElement( "fo:inline" );
    }

    /** Creates a detached {@code fo:leader}. */
    protected Element createLeader()
    {
        return newFoElement( "fo:leader" );
    }

    /** Creates a detached {@code fo:list-block}. */
    protected Element createListBlock()
    {
        return newFoElement( "fo:list-block" );
    }

    /** Creates a detached {@code fo:list-item}. */
    protected Element createListItem()
    {
        return newFoElement( "fo:list-item" );
    }

    /** Creates a detached {@code fo:list-item-body}. */
    protected Element createListItemBody()
    {
        return newFoElement( "fo:list-item-body" );
    }

    /**
     * Creates a detached {@code fo:list-item-label} holding a single block
     * (see {@link #createBlock()}) that contains the given text.
     */
    protected Element createListItemLabel( String text )
    {
        Element label = newFoElement( "fo:list-item-label" );
        Element labelBlock = createBlock();
        labelBlock.appendChild( document.createTextNode( text ) );
        label.appendChild( labelBlock );
        return label;
    }

    /** Creates a detached {@code fo:table-body}. */
    protected Element createTableBody()
    {
        return newFoElement( "fo:table-body" );
    }

    /** Creates a detached {@code fo:table-cell}. */
    protected Element createTableCell()
    {
        return newFoElement( "fo:table-cell" );
    }

    /** Creates a detached {@code fo:table-header}. */
    protected Element createTableHeader()
    {
        return newFoElement( "fo:table-header" );
    }

    /** Creates a detached {@code fo:table-row}. */
    protected Element createTableRow()
    {
        return newFoElement( "fo:table-row" );
    }

    /** Creates a detached text node with the given content. */
    protected Text createText( String data )
    {
        return document.createTextNode( data );
    }

    /**
     * @return the DOM document that was passed to the constructor
     */
    public Document getDocument()
    {
        return document;
    }
}

View File

@ -16,7 +16,6 @@
* limitations under the License. * limitations under the License.
* ==================================================================== * ====================================================================
*/ */
package org.apache.poi.hwpf.extractor; package org.apache.poi.hwpf.extractor;
import java.io.File; import java.io.File;
@ -25,6 +24,9 @@ import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys; import javax.xml.transform.OutputKeys;
@ -46,6 +48,8 @@ import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell; import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator; import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow; import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.w3c.dom.Document; import org.w3c.dom.Document;
import org.w3c.dom.Element; import org.w3c.dom.Element;
import org.w3c.dom.Text; import org.w3c.dom.Text;
@ -55,7 +59,30 @@ import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH;
/** /**
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com) * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
*/ */
public class WordToFoExtractor { public class WordToFoExtractor extends AbstractToFoExtractor
{
/**
 * Immutable snapshot of the character formatting already applied to the
 * current {@code fo:block} element. Child {@code fo:inline} elements are
 * compared against these values so that a property equal to the block's is
 * not repeated on the inline.
 */
private static class BlockProperies
{
    /** Font family set on the block. */
    final String pFontName;

    /** Font size set on the block. */
    final int pFontSize;

    /** Whether the block is bold. */
    final boolean pBold;

    /** Whether the block is italic. */
    final boolean pItalic;

    public BlockProperies( String pFontName, int pFontSize, boolean pBold,
            boolean pItalic )
    {
        this.pBold = pBold;
        this.pItalic = pItalic;
        this.pFontName = pFontName;
        this.pFontSize = pFontSize;
    }
}
private static final byte BEL_MARK = 7; private static final byte BEL_MARK = 7;
@ -65,22 +92,74 @@ public class WordToFoExtractor {
private static final byte FIELD_SEPARATOR_MARK = 20; private static final byte FIELD_SEPARATOR_MARK = 20;
private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format"; private static final POILogger logger = POILogFactory
.getLogger( WordToFoExtractor.class );
private static HWPFDocument loadDoc(File docFile) throws IOException { private static HWPFDocument loadDoc( File docFile ) throws IOException
{
final FileInputStream istream = new FileInputStream( docFile ); final FileInputStream istream = new FileInputStream( docFile );
try { try
{
return new HWPFDocument( istream ); return new HWPFDocument( istream );
} finally { }
try { finally
{
try
{
istream.close(); istream.close();
} catch (Exception exc) { }
// no op catch ( Exception exc )
{
logger.log( POILogger.ERROR,
"Unable to close FileInputStream: " + exc, exc );
} }
} }
} }
static Document process(File docFile) throws Exception { /**
* Java main() interface to interact with WordToFoExtractor
*
* <p>
* Usage: WordToFoExtractor infile outfile
* </p>
* Where infile is an input .doc file (Word 97-2007) which will be rendered
* as XSL-FO into outfile.
*
*/
public static void main( String[] args )
{
    if ( args.length < 2 )
    {
        System.err
                .println( "Usage: WordToFoExtractor <inputFile.doc> <saveTo.fo>" );
        return;
    }

    System.out.println( "Converting " + args[0] );
    System.out.println( "Saving output to " + args[1] );
    try
    {
        Document doc = WordToFoExtractor.process( new File( args[0] ) );

        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        // TODO set encoding from a command argument
        serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
        serializer.setOutputProperty( OutputKeys.INDENT, "yes" );

        // Serialize to a byte stream rather than a FileWriter: a FileWriter
        // encodes characters with the platform default charset, which may
        // disagree with the UTF-8 encoding requested (and declared in the
        // XML prolog) above. With an OutputStream the serializer performs
        // the encoding itself.
        java.io.OutputStream out = new java.io.FileOutputStream( args[1] );
        try
        {
            serializer.transform( new DOMSource( doc ), new StreamResult(
                    out ) );
        }
        finally
        {
            // release the file handle even when the transformation fails
            out.close();
        }
    }
    catch ( Exception e )
    {
        e.printStackTrace();
    }
}
static Document process( File docFile ) throws Exception
{
final HWPFDocument hwpfDocument = loadDoc( docFile ); final HWPFDocument hwpfDocument = loadDoc( docFile );
WordToFoExtractor wordToFoExtractor = new WordToFoExtractor( WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
DocumentBuilderFactory.newInstance().newDocumentBuilder() DocumentBuilderFactory.newInstance().newDocumentBuilder()
@ -89,123 +168,24 @@ public class WordToFoExtractor {
return wordToFoExtractor.getDocument(); return wordToFoExtractor.getDocument();
} }
private final Document document; private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
private final Element layoutMasterSet; /**
* Creates new instance of {@link WordToFoExtractor}. Can be used for output
private final Element root; * several {@link HWPFDocument}s into single FO document.
*
public WordToFoExtractor(Document document) throws Exception { * @param document
this.document = document; * XML DOM Document used as XSL FO document. Shall support
* namespaces
root = document.createElementNS(NS_XSLFO, "fo:root"); */
document.appendChild(root); public WordToFoExtractor( Document document )
{
layoutMasterSet = document.createElementNS(NS_XSLFO, super( document );
"fo:layout-master-set");
root.appendChild(layoutMasterSet);
}
protected Element addFlowToPageSequence(final Element pageSequence,
String flowName) {
final Element flow = document.createElementNS(NS_XSLFO, "fo:flow");
flow.setAttribute("flow-name", flowName);
pageSequence.appendChild(flow);
return flow;
}
protected Element addListItem(Element listBlock) {
Element result = createListItem();
listBlock.appendChild(result);
return result;
}
protected Element addListItemBody(Element listItem) {
Element result = createListItemBody();
listItem.appendChild(result);
return result;
}
protected Element addListItemLabel(Element listItem, String text) {
Element result = createListItemLabel(text);
listItem.appendChild(result);
return result;
}
protected Element addPageSequence(String pageMaster) {
final Element pageSequence = document.createElementNS(NS_XSLFO,
"fo:page-sequence");
pageSequence.setAttribute("master-reference", pageMaster);
root.appendChild(pageSequence);
return pageSequence;
}
protected Element addRegionBody(Element pageMaster) {
final Element regionBody = document.createElementNS(NS_XSLFO,
"fo:region-body");
pageMaster.appendChild(regionBody);
return regionBody;
}
protected Element addSimplePageMaster(String masterName) {
final Element simplePageMaster = document.createElementNS(NS_XSLFO,
"fo:simple-page-master");
simplePageMaster.setAttribute("master-name", masterName);
layoutMasterSet.appendChild(simplePageMaster);
return simplePageMaster;
}
protected Element addTable(Element flow) {
final Element table = document.createElementNS(NS_XSLFO, "fo:table");
flow.appendChild(table);
return table;
}
protected Element createBlock() {
return document.createElementNS(NS_XSLFO, "fo:block");
}
protected Element createExternalGraphic(String source) {
Element result = document.createElementNS(NS_XSLFO,
"fo:external-graphic");
result.setAttribute("src", "url('" + source + "')");
return result;
}
protected Element createInline() {
return document.createElementNS(NS_XSLFO, "fo:inline");
}
protected Element createLeader() {
return document.createElementNS(NS_XSLFO, "fo:leader");
}
protected Element createListBlock() {
return document.createElementNS(NS_XSLFO, "fo:list-block");
}
protected Element createListItem() {
return document.createElementNS(NS_XSLFO, "fo:list-item");
}
protected Element createListItemBody() {
return document.createElementNS(NS_XSLFO, "fo:list-item-body");
}
protected Element createListItemLabel(String text) {
Element result = document.createElementNS(NS_XSLFO,
"fo:list-item-label");
Element block = createBlock();
block.appendChild(document.createTextNode(text));
result.appendChild(block);
return result;
} }
protected String createPageMaster( SectionProperties sep, String type, protected String createPageMaster( SectionProperties sep, String type,
int section) { int section )
{
float height = sep.getYaPage() / TWIPS_PER_INCH; float height = sep.getYaPage() / TWIPS_PER_INCH;
float width = sep.getXaPage() / TWIPS_PER_INCH; float width = sep.getXaPage() / TWIPS_PER_INCH;
float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH; float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH;
@ -234,12 +214,17 @@ public class WordToFoExtractor {
// WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left"); // WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left");
// WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right"); // WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right");
if (sep.getCcolM1() > 0) { if ( sep.getCcolM1() > 0 )
regionBody.setAttribute("column-count", "" + (sep.getCcolM1() + 1)); {
if (sep.getFEvenlySpaced()) { regionBody
.setAttribute( "column-count", "" + (sep.getCcolM1() + 1) );
if ( sep.getFEvenlySpaced() )
{
regionBody.setAttribute( "column-gap", regionBody.setAttribute( "column-gap",
(sep.getDxaColumns() / TWIPS_PER_INCH) + "in" ); (sep.getDxaColumns() / TWIPS_PER_INCH) + "in" );
} else { }
else
{
regionBody.setAttribute( "column-gap", "0.25in" ); regionBody.setAttribute( "column-gap", "0.25in" );
} }
} }
@ -247,38 +232,173 @@ public class WordToFoExtractor {
return pageMasterName; return pageMasterName;
} }
protected Element createTableBody() { protected boolean processCharacters( HWPFDocument hwpfDocument,
return document.createElementNS(NS_XSLFO, "fo:table-body"); int currentTableLevel, Paragraph paragraph, final Element block,
final int start, final int end )
{
boolean haveAnyText = false;
for ( int c = start; c < end; c++ )
{
CharacterRun characterRun = paragraph.getCharacterRun( c );
if ( hwpfDocument.getPicturesTable().hasPicture( characterRun ) )
{
Picture picture = hwpfDocument.getPicturesTable()
.extractPicture( characterRun, true );
processImage( block, characterRun.text().charAt( 0 ) == 0x01,
picture );
continue;
} }
protected Element createTableCell() { String text = characterRun.text();
return document.createElementNS(NS_XSLFO, "fo:table-cell"); if ( text.getBytes().length == 0 )
continue;
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
{
int skipTo = tryField( hwpfDocument, paragraph,
currentTableLevel, c, block );
if ( skipTo != c )
{
c = skipTo;
continue;
} }
protected Element createTableHeader() { continue;
return document.createElementNS(NS_XSLFO, "fo:table-header"); }
if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
{
// shall not appear without FIELD_BEGIN_MARK
continue;
}
if ( text.getBytes()[0] == FIELD_END_MARK )
{
// shall not appear without FIELD_BEGIN_MARK
continue;
} }
protected Element createTableRow() { if ( characterRun.isSpecialCharacter() || characterRun.isObj()
return document.createElementNS(NS_XSLFO, "fo:table-row"); || characterRun.isOle2() )
{
continue;
} }
protected Text createText(String data) { BlockProperies blockProperies = this.blocksProperies.peek();
return document.createTextNode(data); Element inline = createInline();
if ( characterRun.isBold() != blockProperies.pBold )
{
WordToFoUtils.setBold( inline, characterRun.isBold() );
}
if ( characterRun.isItalic() != blockProperies.pItalic )
{
WordToFoUtils.setItalic( inline, characterRun.isItalic() );
}
if ( !WordToFoUtils.equals( characterRun.getFontName(),
blockProperies.pFontName ) )
{
WordToFoUtils
.setFontFamily( inline, characterRun.getFontName() );
}
if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
{
WordToFoUtils.setFontSize( inline,
characterRun.getFontSize() / 2 );
}
WordToFoUtils.setCharactersProperties( characterRun, inline );
block.appendChild( inline );
if ( text.endsWith( "\r" )
|| (text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0) )
text = text.substring( 0, text.length() - 1 );
Text textNode = createText( text );
inline.appendChild( textNode );
haveAnyText |= text.trim().length() != 0;
} }
public Document getDocument() { return haveAnyText;
return document;
} }
public void processDocument(HWPFDocument hwpfDocument) { public void processDocument( HWPFDocument hwpfDocument )
{
final Range range = hwpfDocument.getRange(); final Range range = hwpfDocument.getRange();
for (int s = 0; s < range.numSections(); s++) { for ( int s = 0; s < range.numSections(); s++ )
{
processSection( hwpfDocument, range.getSection( s ), s ); processSection( hwpfDocument, range.getSection( s ), s );
} }
} }
/**
 * Matches the instruction text of a HYPERLINK field; group 1 is the text
 * between the quotes.
 */
private static final Pattern HYPERLINK_FIELD_PATTERN = Pattern
        .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );

/**
 * Matches the instruction text of a PAGEREF field that carries the
 * <tt>\h</tt> switch; group 1 is the referenced destination.
 */
private static final Pattern PAGEREF_FIELD_PATTERN = Pattern
        .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );

/**
 * Renders one field whose begin, separator and end marks have already been
 * located among the character runs of the paragraph.
 * <p>
 * HYPERLINK fields are passed to
 * {@link #processHyperlink(HWPFDocument, Element, Paragraph, int, String, int, int)}
 * and PAGEREF fields to
 * {@link #processPageref(HWPFDocument, Element, Paragraph, int, String, int, int)}.
 * Any other field is logged as unsupported and only its value (the runs
 * between the separator and end marks) is written out.
 *
 * @param hwpfDocument
 *            document being converted
 * @param currentBlock
 *            element that receives the output
 * @param paragraph
 *            paragraph containing the field
 * @param currentTableLevel
 *            table nesting level, passed through to the character
 *            processing
 * @param beginMark
 *            index of the character run holding the field begin mark
 * @param separatorMark
 *            index of the character run holding the field separator mark
 * @param endMark
 *            index of the character run holding the field end mark
 */
protected void processField( HWPFDocument hwpfDocument,
        Element currentBlock, Paragraph paragraph, int currentTableLevel,
        int beginMark, int separatorMark, int endMark )
{
    if ( separatorMark - beginMark > 1 )
    {
        // Only the first run after the begin mark is inspected, so a field
        // instruction split over several runs is not recognized here.
        final String fieldCode = paragraph.getCharacterRun( beginMark + 1 )
                .text();

        final Matcher hyperlinkMatcher = HYPERLINK_FIELD_PATTERN
                .matcher( fieldCode );
        if ( hyperlinkMatcher.matches() )
        {
            String hyperlink = hyperlinkMatcher.group( 1 );
            processHyperlink( hwpfDocument, currentBlock, paragraph,
                    currentTableLevel, hyperlink, separatorMark + 1,
                    endMark );
            return;
        }

        final Matcher pagerefMatcher = PAGEREF_FIELD_PATTERN
                .matcher( fieldCode );
        if ( pagerefMatcher.matches() )
        {
            String pageref = pagerefMatcher.group( 1 );
            processPageref( hwpfDocument, currentBlock, paragraph,
                    currentTableLevel, pageref, separatorMark + 1, endMark );
            return;
        }
    }

    // Unknown field type: dump every run of the field into the log ...
    StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
    for ( int i = beginMark; i <= endMark; i++ )
    {
        debug.append( "\t" );
        debug.append( paragraph.getCharacterRun( i ) );
        debug.append( "\n" );
    }
    logger.log( POILogger.WARN, debug );

    // ... and just output the field value, if there is one
    if ( separatorMark + 1 < endMark )
    {
        processCharacters( hwpfDocument, currentTableLevel, paragraph,
                currentBlock, separatorMark + 1, endMark );
    }
}
/**
 * Appends an external basic link for the given target to the current block
 * and renders the character runs of the link text inside it.
 *
 * @param hwpfDocument
 *            document being converted
 * @param currentBlock
 *            element that receives the link
 * @param paragraph
 *            paragraph containing the link text
 * @param currentTableLevel
 *            table nesting level, passed through to the character
 *            processing
 * @param hyperlink
 *            link target, used as the external destination
 * @param beginTextInclusive
 *            index of the first character run of the link text
 * @param endTextExclusive
 *            index just past the last character run of the link text
 */
protected void processHyperlink( HWPFDocument hwpfDocument,
        Element currentBlock, Paragraph paragraph, int currentTableLevel,
        String hyperlink, int beginTextInclusive, int endTextExclusive )
{
    final Element link = createBasicLinkExternal( hyperlink );
    currentBlock.appendChild( link );

    // an empty run range leaves the link element without content
    if ( beginTextInclusive >= endTextExclusive )
    {
        return;
    }

    processCharacters( hwpfDocument, currentTableLevel, paragraph, link,
            beginTextInclusive, endTextExclusive );
}
/** /**
* This method shall store image bytes in external file and convert it if * This method shall store image bytes in external file and convert it if
* necessary. Images shall be stored using PNG format (for bitmap) or SVG * necessary. Images shall be stored using PNG format (for bitmap) or SVG
@ -299,13 +419,29 @@ public class WordToFoExtractor {
* HWPF object, contained picture data and properties * HWPF object, contained picture data and properties
*/ */
protected void processImage( Element currentBlock, boolean inlined, protected void processImage( Element currentBlock, boolean inlined,
Picture picture) { Picture picture )
{
// no default implementation -- skip // no default implementation -- skip
currentBlock.appendChild( document.createComment( "Image link to '"
+ picture.suggestFullFileName() + "' can be here" ) );
}
protected void processPageref( HWPFDocument hwpfDocument,
Element currentBlock, Paragraph paragraph, int currentTableLevel,
String pageref, int beginTextInclusive, int endTextExclusive )
{
Element basicLink = createBasicLinkInternal( pageref );
currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive )
processCharacters( hwpfDocument, currentTableLevel, paragraph,
basicLink, beginTextInclusive, endTextExclusive );
} }
protected void processParagraph( HWPFDocument hwpfDocument, protected void processParagraph( HWPFDocument hwpfDocument,
Element parentFopElement, int currentTableLevel, Element parentFopElement, int currentTableLevel,
Paragraph paragraph, String bulletText) { Paragraph paragraph, String bulletText )
{
final Element block = createBlock(); final Element block = createBlock();
parentFopElement.appendChild( block ); parentFopElement.appendChild( block );
@ -313,10 +449,12 @@ public class WordToFoExtractor {
final int charRuns = paragraph.numCharacterRuns(); final int charRuns = paragraph.numCharacterRuns();
if (charRuns == 0) { if ( charRuns == 0 )
{
return; return;
} }
{
final String pFontName; final String pFontName;
final int pFontSize; final int pFontSize;
final boolean pBold; final boolean pBold;
@ -333,97 +471,44 @@ public class WordToFoExtractor {
WordToFoUtils.setBold( block, pBold ); WordToFoUtils.setBold( block, pBold );
WordToFoUtils.setItalic( block, pItalic ); WordToFoUtils.setItalic( block, pItalic );
StringBuilder lineText = new StringBuilder(); blocksProperies.push( new BlockProperies( pFontName, pFontSize,
pBold, pItalic ) );
}
try
{
boolean haveAnyText = false;
if (WordToFoUtils.isNotEmpty(bulletText)) { if ( WordToFoUtils.isNotEmpty( bulletText ) )
{
Element inline = createInline(); Element inline = createInline();
block.appendChild( inline ); block.appendChild( inline );
Text textNode = createText( bulletText ); Text textNode = createText( bulletText );
inline.appendChild( textNode ); inline.appendChild( textNode );
lineText.append(bulletText); haveAnyText |= bulletText.trim().length() != 0;
} }
for (int c = 0; c < charRuns; c++) { haveAnyText = processCharacters( hwpfDocument, currentTableLevel,
CharacterRun characterRun = paragraph.getCharacterRun(c); paragraph, block, 0, charRuns );
if (hwpfDocument.getPicturesTable().hasPicture(characterRun)) { if ( !haveAnyText )
Picture picture = hwpfDocument.getPicturesTable() {
.extractPicture(characterRun, true);
processImage(block, characterRun.text().charAt(0) == 0x01,
picture);
continue;
}
String text = characterRun.text();
if (text.getBytes().length == 0)
continue;
if (text.getBytes()[0] == FIELD_BEGIN_MARK) {
/*
* check if we have a field with calculated image as a result.
* MathType equation, for example.
*/
int skipTo = tryImageWithinField(hwpfDocument, paragraph, c,
block);
if (skipTo != c) {
c = skipTo;
continue;
}
continue;
}
if (text.getBytes()[0] == FIELD_SEPARATOR_MARK) {
continue;
}
if (text.getBytes()[0] == FIELD_END_MARK) {
continue;
}
if (characterRun.isSpecialCharacter() || characterRun.isObj()
|| characterRun.isOle2()) {
continue;
}
Element inline = createInline();
if (characterRun.isBold() != pBold) {
WordToFoUtils.setBold(inline, characterRun.isBold());
}
if (characterRun.isItalic() != pItalic) {
WordToFoUtils.setItalic(inline, characterRun.isItalic());
}
if (!WordToFoUtils.equals(characterRun.getFontName(), pFontName)) {
WordToFoUtils.setFontFamily(inline, characterRun.getFontName());
}
if (characterRun.getFontSize() / 2 != pFontSize) {
WordToFoUtils.setFontSize(inline,
characterRun.getFontSize() / 2);
}
WordToFoUtils.setCharactersProperties(characterRun, inline);
block.appendChild(inline);
if (text.endsWith("\r")
|| (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != 0))
text = text.substring(0, text.length() - 1);
Text textNode = createText(text);
inline.appendChild(textNode);
lineText.append(text);
}
if (lineText.toString().trim().length() == 0) {
Element leader = createLeader(); Element leader = createLeader();
block.appendChild( leader ); block.appendChild( leader );
} }
}
finally
{
blocksProperies.pop();
}
return; return;
} }
protected void processSection( HWPFDocument hwpfDocument, Section section, protected void processSection( HWPFDocument hwpfDocument, Section section,
int sectionCounter) { int sectionCounter )
{
String regularPage = createPageMaster( String regularPage = createPageMaster(
WordToFoUtils.getSectionProperties( section ), "page", WordToFoUtils.getSectionProperties( section ), "page",
sectionCounter ); sectionCounter );
@ -435,10 +520,12 @@ public class WordToFoExtractor {
} }
protected void processSectionParagraphes( HWPFDocument hwpfDocument, protected void processSectionParagraphes( HWPFDocument hwpfDocument,
Element flow, Range range, int currentTableLevel) { Element flow, Range range, int currentTableLevel )
{
final Map<Integer, Table> allTables = new HashMap<Integer, Table>(); final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
for ( TableIterator tableIterator = WordToFoUtils.newTableIterator( for ( TableIterator tableIterator = WordToFoUtils.newTableIterator(
range, currentTableLevel + 1); tableIterator.hasNext();) { range, currentTableLevel + 1 ); tableIterator.hasNext(); )
{
Table next = tableIterator.next(); Table next = tableIterator.next();
allTables.put( Integer.valueOf( next.getStartOffset() ), next ); allTables.put( Integer.valueOf( next.getStartOffset() ), next );
} }
@ -447,11 +534,13 @@ public class WordToFoExtractor {
int currentListInfo = 0; int currentListInfo = 0;
final int paragraphs = range.numParagraphs(); final int paragraphs = range.numParagraphs();
for (int p = 0; p < paragraphs; p++) { for ( int p = 0; p < paragraphs; p++ )
{
Paragraph paragraph = range.getParagraph( p ); Paragraph paragraph = range.getParagraph( p );
if ( allTables.containsKey( Integer.valueOf( paragraph if ( allTables.containsKey( Integer.valueOf( paragraph
.getStartOffset()))) { .getStartOffset() ) ) )
{
Table table = allTables.get( Integer.valueOf( paragraph Table table = allTables.get( Integer.valueOf( paragraph
.getStartOffset() ) ); .getStartOffset() ) );
processTable( hwpfDocument, flow, table, currentTableLevel + 1 ); processTable( hwpfDocument, flow, table, currentTableLevel + 1 );
@ -459,15 +548,18 @@ public class WordToFoExtractor {
} }
if ( paragraph.isInTable() if ( paragraph.isInTable()
&& paragraph.getTableLevel() != currentTableLevel) { && paragraph.getTableLevel() != currentTableLevel )
{
continue; continue;
} }
if (paragraph.getIlfo() != currentListInfo) { if ( paragraph.getIlfo() != currentListInfo )
{
currentListInfo = paragraph.getIlfo(); currentListInfo = paragraph.getIlfo();
} }
if (currentListInfo != 0) { if ( currentListInfo != 0 )
{
final ListFormatOverride listFormatOverride = listTables final ListFormatOverride listFormatOverride = listTables
.getOverride( paragraph.getIlfo() ); .getOverride( paragraph.getIlfo() );
@ -476,7 +568,9 @@ public class WordToFoExtractor {
processParagraph( hwpfDocument, flow, currentTableLevel, processParagraph( hwpfDocument, flow, currentTableLevel,
paragraph, label ); paragraph, label );
} else { }
else
{
processParagraph( hwpfDocument, flow, currentTableLevel, processParagraph( hwpfDocument, flow, currentTableLevel,
paragraph, WordToFoUtils.EMPTY ); paragraph, WordToFoUtils.EMPTY );
} }
@ -485,7 +579,8 @@ public class WordToFoExtractor {
} }
protected void processTable( HWPFDocument hwpfDocument, Element flow, protected void processTable( HWPFDocument hwpfDocument, Element flow,
Table table, int thisTableLevel) { Table table, int thisTableLevel )
{
Element tableElement = addTable( flow ); Element tableElement = addTable( flow );
Element tableHeader = createTableHeader(); Element tableHeader = createTableHeader();
@ -494,18 +589,21 @@ public class WordToFoExtractor {
final int tableRows = table.numRows(); final int tableRows = table.numRows();
int maxColumns = Integer.MIN_VALUE; int maxColumns = Integer.MIN_VALUE;
for (int r = 0; r < tableRows; r++) { for ( int r = 0; r < tableRows; r++ )
{
maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() ); maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );
} }
for (int r = 0; r < tableRows; r++) { for ( int r = 0; r < tableRows; r++ )
{
TableRow tableRow = table.getRow( r ); TableRow tableRow = table.getRow( r );
Element tableRowElement = createTableRow(); Element tableRowElement = createTableRow();
WordToFoUtils.setTableRowProperties( tableRow, tableRowElement ); WordToFoUtils.setTableRowProperties( tableRow, tableRowElement );
final int rowCells = tableRow.numCells(); final int rowCells = tableRow.numCells();
for (int c = 0; c < rowCells; c++) { for ( int c = 0; c < rowCells; c++ )
{
TableCell tableCell = tableRow.getCell( c ); TableCell tableCell = tableRow.getCell( c );
if ( tableCell.isMerged() && !tableCell.isFirstMerged() ) if ( tableCell.isMerged() && !tableCell.isFirstMerged() )
@ -520,9 +618,11 @@ public class WordToFoExtractor {
tableCellElement, r == 0, r == tableRows - 1, c == 0, tableCellElement, r == 0, r == tableRows - 1, c == 0,
c == rowCells - 1 ); c == rowCells - 1 );
if (tableCell.isFirstMerged()) { if ( tableCell.isFirstMerged() )
{
int count = 0; int count = 0;
for (int c1 = c; c1 < rowCells; c1++) { for ( int c1 = c; c1 < rowCells; c1++ )
{
TableCell nextCell = tableRow.getCell( c1 ); TableCell nextCell = tableRow.getCell( c1 );
if ( nextCell.isMerged() ) if ( nextCell.isMerged() )
count++; count++;
@ -531,16 +631,22 @@ public class WordToFoExtractor {
} }
tableCellElement.setAttribute( "number-columns-spanned", "" tableCellElement.setAttribute( "number-columns-spanned", ""
+ count ); + count );
} else { }
if (c == rowCells - 1 && c != maxColumns - 1) { else
tableCellElement.setAttribute("number-columns-spanned", {
"" + (maxColumns - c)); if ( c == rowCells - 1 && c != maxColumns - 1 )
{
tableCellElement
.setAttribute( "number-columns-spanned", ""
+ (maxColumns - c) );
} }
} }
if (tableCell.isFirstVerticallyMerged()) { if ( tableCell.isFirstVerticallyMerged() )
{
int count = 0; int count = 0;
for (int r1 = r; r1 < tableRows; r1++) { for ( int r1 = r; r1 < tableRows; r1++ )
{
TableRow nextRow = table.getRow( r1 ); TableRow nextRow = table.getRow( r1 );
if ( nextRow.numCells() < c ) if ( nextRow.numCells() < c )
break; break;
@ -557,45 +663,59 @@ public class WordToFoExtractor {
processSectionParagraphes( hwpfDocument, tableCellElement, processSectionParagraphes( hwpfDocument, tableCellElement,
tableCell, thisTableLevel ); tableCell, thisTableLevel );
if (!tableCellElement.hasChildNodes()) { if ( !tableCellElement.hasChildNodes() )
{
tableCellElement.appendChild( createBlock() ); tableCellElement.appendChild( createBlock() );
} }
tableRowElement.appendChild( tableCellElement ); tableRowElement.appendChild( tableCellElement );
} }
if (tableRow.isTableHeader()) { if ( tableRow.isTableHeader() )
{
tableHeader.appendChild( tableRowElement ); tableHeader.appendChild( tableRowElement );
} else { }
else
{
tableBody.appendChild( tableRowElement ); tableBody.appendChild( tableRowElement );
} }
} }
if (tableHeader.hasChildNodes()) { if ( tableHeader.hasChildNodes() )
{
tableElement.appendChild( tableHeader ); tableElement.appendChild( tableHeader );
} }
if (tableBody.hasChildNodes()) { if ( tableBody.hasChildNodes() )
{
tableElement.appendChild( tableBody ); tableElement.appendChild( tableBody );
} else { }
System.err.println("Table without body"); else
{
logger.log(
POILogger.WARN,
"Table without body starting on offset "
+ table.getStartOffset() + " -- "
+ table.getEndOffset() );
} }
} }
protected int tryImageWithinField(HWPFDocument hwpfDocument, protected int tryField( HWPFDocument hwpfDocument, Paragraph paragraph,
Paragraph paragraph, int beginMark, Element currentBlock) { int currentTableLevel, int beginMark, Element currentBlock )
{
int separatorMark = -1; int separatorMark = -1;
int pictureMark = -1;
int pictureChar = Integer.MIN_VALUE;
int endMark = -1; int endMark = -1;
for (int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++) { for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
{
CharacterRun characterRun = paragraph.getCharacterRun( c ); CharacterRun characterRun = paragraph.getCharacterRun( c );
String text = characterRun.text(); String text = characterRun.text();
if ( text.getBytes().length == 0 ) if ( text.getBytes().length == 0 )
continue; continue;
if (text.getBytes()[0] == FIELD_SEPARATOR_MARK) { if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
if (separatorMark != -1) { {
if ( separatorMark != -1 )
{
// double; // double;
return beginMark; return beginMark;
} }
@ -604,8 +724,10 @@ public class WordToFoExtractor {
continue; continue;
} }
if (text.getBytes()[0] == FIELD_END_MARK) { if ( text.getBytes()[0] == FIELD_END_MARK )
if (endMark != -1) { {
if ( endMark != -1 )
{
// double; // double;
return beginMark; return beginMark;
} }
@ -614,63 +736,14 @@ public class WordToFoExtractor {
break; break;
} }
if (hwpfDocument.getPicturesTable().hasPicture(characterRun)) {
if (c != -1) {
// double;
return beginMark;
} }
pictureMark = c; if ( separatorMark == -1 || endMark == -1 )
pictureChar = characterRun.text().charAt(0);
continue;
}
}
if (separatorMark == -1 || pictureMark == -1 || endMark == -1)
return beginMark; return beginMark;
final CharacterRun pictureRun = paragraph.getCharacterRun(pictureMark); processField( hwpfDocument, currentBlock, paragraph, currentTableLevel,
final Picture picture = hwpfDocument.getPicturesTable().extractPicture( beginMark, separatorMark, endMark );
pictureRun, true);
processImage(currentBlock, pictureChar == 0x01, picture);
return endMark; return endMark;
} }
/**
 * Java main() interface to interact with WordToFoExtractor
 *
 * <p>
 * Usage: WordToFoExtractor infile outfile
 * </p>
 * Where infile is an input .doc file ( Word 97-2007)
 * which will be rendered as XSL-FO into outfile
 *
 * <p>
 * The output file is written as raw bytes so that it is really encoded in
 * the charset requested from the serializer (UTF-8). Any failure is
 * reported on standard error via the exception stack trace.
 * </p>
 *
 * @param args
 *            command line arguments: the input .doc path followed by the
 *            output .fo path; a usage message is printed when fewer than
 *            two are given
 */
public static void main(String[] args) {
    if (args.length < 2) {
        System.err.println("Usage: WordToFoExtractor <inputFile.doc> <saveTo.fo>");
        return;
    }

    System.out.println("Converting " + args[0]);
    System.out.println("Saving output to " + args[1]);
    try {
        Document doc = WordToFoExtractor.process(new File(args[0]));

        // Give the serializer a byte stream instead of a FileWriter: a
        // FileWriter re-encodes characters with the platform default
        // charset, which may not be the UTF-8 requested below.
        // Fully qualified so that no new import is needed.
        java.io.OutputStream out = new java.io.FileOutputStream(args[1]);
        try {
            DOMSource domSource = new DOMSource(doc);
            StreamResult streamResult = new StreamResult(out);
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer serializer = tf.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); // TODO set encoding from a command argument
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.transform(domSource, streamResult);
        } finally {
            // Release the file handle even when serialization fails.
            out.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
} }

View File

@ -0,0 +1,95 @@
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/
package org.apache.poi.hwpf.extractor;
import java.io.StringWriter;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import junit.framework.TestCase;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocument;
/**
* Test cases for {@link WordToFoExtractor}
*
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
*/
public class TestWordToFoExtractor extends TestCase
{
    /**
     * Converts a sample Word document to XSL-FO and returns the serialized
     * result.
     *
     * @param sampleFileName
     *            name of the sample document, resolved through
     *            {@link POIDataSamples#getDocumentInstance()}
     * @return the indented XML text produced from the extractor's document
     * @throws Exception
     *             if the sample cannot be read, converted or serialized
     */
    private static String getFoText( final String sampleFileName )
            throws Exception
    {
        WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument() );

        // Fully qualified so that the import list stays untouched.
        java.io.InputStream sampleStream = POIDataSamples
                .getDocumentInstance().openResourceAsStream( sampleFileName );
        try
        {
            // The stream stays open until the conversion is finished and is
            // then always released, even when the conversion fails.
            HWPFDocument hwpfDocument = new HWPFDocument( sampleStream );
            wordToFoExtractor.processDocument( hwpfDocument );
        }
        finally
        {
            sampleStream.close();
        }

        StringWriter stringWriter = new StringWriter();

        Transformer transformer = TransformerFactory.newInstance()
                .newTransformer();
        transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
        transformer.transform(
                new DOMSource( wordToFoExtractor.getDocument() ),
                new StreamResult( stringWriter ) );

        return stringWriter.toString();
    }

    /**
     * Checks that hyperlink.doc produces an external link element and that
     * the link text is present in the output.
     */
    public void testHyperlink() throws Exception
    {
        final String sampleFileName = "hyperlink.doc";
        String result = getFoText( sampleFileName );

        assertTrue( result
                .contains( "<fo:basic-link external-destination=\"http://testuri.org/\">" ) );
        assertTrue( result.contains( "Hyperlink text" ) );
    }

    /**
     * Checks that equation.doc produces the placeholder comment for its
     * image.
     */
    public void testEquation() throws Exception
    {
        final String sampleFileName = "equation.doc";
        String result = getFoText( sampleFileName );

        assertTrue( result
                .contains( "<!--Image link to '0.emf' can be here-->" ) );
    }

    /**
     * Checks that pageref.doc produces an internal link to the "userref"
     * destination.
     */
    public void testPageref() throws Exception
    {
        final String sampleFileName = "pageref.doc";
        String result = getFoText( sampleFileName );

        assertTrue( result
                .contains( "<fo:basic-link internal-destination=\"userref\">" ) );
        // NOTE(review): this check is probably vacuous. The serialized text
        // presumably starts with an XML declaration containing "1.0", so "1"
        // is found whatever the field rendered. Tighten it once the expected
        // page number markup is confirmed.
        assertTrue( result.contains( "1" ) );
    }
}

Binary file not shown.

Binary file not shown.

Binary file not shown.