mirror of https://github.com/apache/poi.git
bug 51351: more progress with WordToFoExtractor: support for hyperlinks, common fields and code cleanup
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1137673 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
49448123e1
commit
21885a6fd5
|
@ -0,0 +1,206 @@
|
||||||
|
/*
|
||||||
|
* ====================================================================
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
* ====================================================================
|
||||||
|
*/
|
||||||
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
|
import org.w3c.dom.Document;
|
||||||
|
import org.w3c.dom.Element;
|
||||||
|
import org.w3c.dom.Text;
|
||||||
|
|
||||||
|
/**
 * Base class for extractors that build an XSL-FO tree inside a DOM
 * {@link Document}.
 * <p>
 * The constructor creates the <tt>fo:root</tt> element and its
 * <tt>fo:layout-master-set</tt> child. The <tt>addXxx</tt> methods create an
 * element and attach it to a parent; the <tt>createXxx</tt> methods only
 * create a detached node and leave the placement to the caller. Every element
 * is created in the XSL-FO namespace with the <tt>fo:</tt> prefix.
 * </p>
 */
public abstract class AbstractToFoExtractor
{

    private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";

    /** DOM document that owns every node created by this class. */
    protected final Document document;

    /** The <tt>fo:layout-master-set</tt> element appended to {@link #root}. */
    protected final Element layoutMasterSet;

    /** The <tt>fo:root</tt> element appended to {@link #document}. */
    protected final Element root;

    /**
     * Appends a new <tt>fo:root</tt> to the given document and a new
     * <tt>fo:layout-master-set</tt> to that root.
     *
     * @param document
     *            DOM document used as the XSL-FO document
     */
    public AbstractToFoExtractor( Document document )
    {
        this.document = document;

        this.root = createFoElement( "root" );
        document.appendChild( this.root );

        this.layoutMasterSet = createFoElement( "layout-master-set" );
        this.root.appendChild( this.layoutMasterSet );
    }

    /**
     * Creates a detached element named <tt>fo:&lt;localName&gt;</tt> in the
     * XSL-FO namespace.
     */
    private Element createFoElement( String localName )
    {
        return document.createElementNS( NS_XSLFO, "fo:" + localName );
    }

    /**
     * Appends a new <tt>fo:flow</tt> with the given <tt>flow-name</tt> to the
     * page sequence.
     *
     * @return the created flow element
     */
    protected Element addFlowToPageSequence( final Element pageSequence,
            String flowName )
    {
        final Element createdFlow = createFoElement( "flow" );
        createdFlow.setAttribute( "flow-name", flowName );
        pageSequence.appendChild( createdFlow );
        return createdFlow;
    }

    /**
     * Appends the element returned by {@link #createListItem()} to the list
     * block.
     *
     * @return the appended list item
     */
    protected Element addListItem( Element listBlock )
    {
        Element item = createListItem();
        listBlock.appendChild( item );
        return item;
    }

    /**
     * Appends the element returned by {@link #createListItemBody()} to the
     * list item.
     *
     * @return the appended list item body
     */
    protected Element addListItemBody( Element listItem )
    {
        Element body = createListItemBody();
        listItem.appendChild( body );
        return body;
    }

    /**
     * Appends the element returned by {@link #createListItemLabel(String)} to
     * the list item.
     *
     * @return the appended list item label
     */
    protected Element addListItemLabel( Element listItem, String text )
    {
        Element label = createListItemLabel( text );
        listItem.appendChild( label );
        return label;
    }

    /**
     * Appends a new <tt>fo:page-sequence</tt> to {@link #root}, with its
     * <tt>master-reference</tt> attribute set to the given page master name.
     *
     * @return the created page sequence
     */
    protected Element addPageSequence( String pageMaster )
    {
        final Element sequence = createFoElement( "page-sequence" );
        sequence.setAttribute( "master-reference", pageMaster );
        root.appendChild( sequence );
        return sequence;
    }

    /**
     * Appends a new <tt>fo:region-body</tt> to the given page master.
     *
     * @return the created region body
     */
    protected Element addRegionBody( Element pageMaster )
    {
        final Element body = createFoElement( "region-body" );
        pageMaster.appendChild( body );
        return body;
    }

    /**
     * Appends a new <tt>fo:simple-page-master</tt> with the given
     * <tt>master-name</tt> to {@link #layoutMasterSet}.
     *
     * @return the created page master
     */
    protected Element addSimplePageMaster( String masterName )
    {
        final Element master = createFoElement( "simple-page-master" );
        master.setAttribute( "master-name", masterName );
        layoutMasterSet.appendChild( master );
        return master;
    }

    /**
     * Appends a new <tt>fo:table</tt> to the given flow.
     *
     * @return the created table
     */
    protected Element addTable( Element flow )
    {
        final Element createdTable = createFoElement( "table" );
        flow.appendChild( createdTable );
        return createdTable;
    }

    /**
     * Creates a detached <tt>fo:basic-link</tt> whose
     * <tt>external-destination</tt> attribute is the given value.
     */
    protected Element createBasicLinkExternal( String externalDestination )
    {
        final Element link = createFoElement( "basic-link" );
        link.setAttribute( "external-destination", externalDestination );
        return link;
    }

    /**
     * Creates a detached <tt>fo:basic-link</tt> whose
     * <tt>internal-destination</tt> attribute is the given value.
     */
    protected Element createBasicLinkInternal( String internalDestination )
    {
        final Element link = createFoElement( "basic-link" );
        link.setAttribute( "internal-destination", internalDestination );
        return link;
    }

    /** Creates a detached <tt>fo:block</tt>. */
    protected Element createBlock()
    {
        return createFoElement( "block" );
    }

    /**
     * Creates a detached <tt>fo:external-graphic</tt> whose <tt>src</tt>
     * attribute is <tt>url('source')</tt>. The source string is inserted as
     * is, without any escaping.
     */
    protected Element createExternalGraphic( String source )
    {
        Element graphic = createFoElement( "external-graphic" );
        graphic.setAttribute( "src", "url('" + source + "')" );
        return graphic;
    }

    /** Creates a detached <tt>fo:inline</tt>. */
    protected Element createInline()
    {
        return createFoElement( "inline" );
    }

    /** Creates a detached <tt>fo:leader</tt>. */
    protected Element createLeader()
    {
        return createFoElement( "leader" );
    }

    /** Creates a detached <tt>fo:list-block</tt>. */
    protected Element createListBlock()
    {
        return createFoElement( "list-block" );
    }

    /** Creates a detached <tt>fo:list-item</tt>. */
    protected Element createListItem()
    {
        return createFoElement( "list-item" );
    }

    /** Creates a detached <tt>fo:list-item-body</tt>. */
    protected Element createListItemBody()
    {
        return createFoElement( "list-item-body" );
    }

    /**
     * Creates a detached <tt>fo:list-item-label</tt> containing one block
     * (from {@link #createBlock()}) that holds the given text.
     */
    protected Element createListItemLabel( String text )
    {
        Element label = createFoElement( "list-item-label" );
        Element labelBlock = createBlock();
        labelBlock.appendChild( document.createTextNode( text ) );
        label.appendChild( labelBlock );
        return label;
    }

    /** Creates a detached <tt>fo:table-body</tt>. */
    protected Element createTableBody()
    {
        return createFoElement( "table-body" );
    }

    /** Creates a detached <tt>fo:table-cell</tt>. */
    protected Element createTableCell()
    {
        return createFoElement( "table-cell" );
    }

    /** Creates a detached <tt>fo:table-header</tt>. */
    protected Element createTableHeader()
    {
        return createFoElement( "table-header" );
    }

    /** Creates a detached <tt>fo:table-row</tt>. */
    protected Element createTableRow()
    {
        return createFoElement( "table-row" );
    }

    /** Creates a detached DOM text node with the given content. */
    protected Text createText( String data )
    {
        return document.createTextNode( data );
    }

    /**
     * @return the DOM document passed to the constructor
     */
    public Document getDocument()
    {
        return document;
    }

}
|
|
@ -16,7 +16,6 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
* ====================================================================
|
* ====================================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.poi.hwpf.extractor;
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
@ -25,6 +24,9 @@ import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Stack;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import javax.xml.parsers.DocumentBuilderFactory;
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
import javax.xml.transform.OutputKeys;
|
import javax.xml.transform.OutputKeys;
|
||||||
|
@ -46,6 +48,8 @@ import org.apache.poi.hwpf.usermodel.Table;
|
||||||
import org.apache.poi.hwpf.usermodel.TableCell;
|
import org.apache.poi.hwpf.usermodel.TableCell;
|
||||||
import org.apache.poi.hwpf.usermodel.TableIterator;
|
import org.apache.poi.hwpf.usermodel.TableIterator;
|
||||||
import org.apache.poi.hwpf.usermodel.TableRow;
|
import org.apache.poi.hwpf.usermodel.TableRow;
|
||||||
|
import org.apache.poi.util.POILogFactory;
|
||||||
|
import org.apache.poi.util.POILogger;
|
||||||
import org.w3c.dom.Document;
|
import org.w3c.dom.Document;
|
||||||
import org.w3c.dom.Element;
|
import org.w3c.dom.Element;
|
||||||
import org.w3c.dom.Text;
|
import org.w3c.dom.Text;
|
||||||
|
@ -55,7 +59,30 @@ import static org.apache.poi.hwpf.extractor.WordToFoUtils.TWIPS_PER_INCH;
|
||||||
/**
|
/**
|
||||||
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
|
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
|
||||||
*/
|
*/
|
||||||
public class WordToFoExtractor {
|
public class WordToFoExtractor extends AbstractToFoExtractor
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Holds properties values, applied to current <tt>fo:block</tt> element.
|
||||||
|
* Those properties shall not be doubled in children <tt>fo:inline</tt>
|
||||||
|
* elements.
|
||||||
|
*/
|
||||||
|
private static class BlockProperies
|
||||||
|
{
|
||||||
|
final boolean pBold;
|
||||||
|
final String pFontName;
|
||||||
|
final int pFontSize;
|
||||||
|
final boolean pItalic;
|
||||||
|
|
||||||
|
public BlockProperies( String pFontName, int pFontSize, boolean pBold,
|
||||||
|
boolean pItalic )
|
||||||
|
{
|
||||||
|
this.pFontName = pFontName;
|
||||||
|
this.pFontSize = pFontSize;
|
||||||
|
this.pBold = pBold;
|
||||||
|
this.pItalic = pItalic;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static final byte BEL_MARK = 7;
|
private static final byte BEL_MARK = 7;
|
||||||
|
|
||||||
|
@ -65,22 +92,74 @@ public class WordToFoExtractor {
|
||||||
|
|
||||||
private static final byte FIELD_SEPARATOR_MARK = 20;
|
private static final byte FIELD_SEPARATOR_MARK = 20;
|
||||||
|
|
||||||
private static final String NS_XSLFO = "http://www.w3.org/1999/XSL/Format";
|
private static final POILogger logger = POILogFactory
|
||||||
|
.getLogger( WordToFoExtractor.class );
|
||||||
|
|
||||||
private static HWPFDocument loadDoc(File docFile) throws IOException {
|
private static HWPFDocument loadDoc( File docFile ) throws IOException
|
||||||
|
{
|
||||||
final FileInputStream istream = new FileInputStream( docFile );
|
final FileInputStream istream = new FileInputStream( docFile );
|
||||||
try {
|
try
|
||||||
|
{
|
||||||
return new HWPFDocument( istream );
|
return new HWPFDocument( istream );
|
||||||
} finally {
|
}
|
||||||
try {
|
finally
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
istream.close();
|
istream.close();
|
||||||
} catch (Exception exc) {
|
}
|
||||||
// no op
|
catch ( Exception exc )
|
||||||
|
{
|
||||||
|
logger.log( POILogger.ERROR,
|
||||||
|
"Unable to close FileInputStream: " + exc, exc );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static Document process(File docFile) throws Exception {
|
/**
|
||||||
|
* Java main() interface to interact with WordToFoExtractor
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Usage: WordToFoExtractor infile outfile
|
||||||
|
* </p>
|
||||||
|
* Where infile is an input .doc file ( Word 97-2007) which will be rendered
|
||||||
|
* as XSL-FO into outfile
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public static void main( String[] args )
|
||||||
|
{
|
||||||
|
if ( args.length < 2 )
|
||||||
|
{
|
||||||
|
System.err
|
||||||
|
.println( "Usage: WordToFoExtractor <inputFile.doc> <saveTo.fo>" );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println( "Converting " + args[0] );
|
||||||
|
System.out.println( "Saving output to " + args[1] );
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Document doc = WordToFoExtractor.process( new File( args[0] ) );
|
||||||
|
|
||||||
|
FileWriter out = new FileWriter( args[1] );
|
||||||
|
DOMSource domSource = new DOMSource( doc );
|
||||||
|
StreamResult streamResult = new StreamResult( out );
|
||||||
|
TransformerFactory tf = TransformerFactory.newInstance();
|
||||||
|
Transformer serializer = tf.newTransformer();
|
||||||
|
// TODO set encoding from a command argument
|
||||||
|
serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
|
||||||
|
serializer.setOutputProperty( OutputKeys.INDENT, "yes" );
|
||||||
|
serializer.transform( domSource, streamResult );
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
catch ( Exception e )
|
||||||
|
{
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static Document process( File docFile ) throws Exception
|
||||||
|
{
|
||||||
final HWPFDocument hwpfDocument = loadDoc( docFile );
|
final HWPFDocument hwpfDocument = loadDoc( docFile );
|
||||||
WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
|
WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
|
||||||
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||||
|
@ -89,123 +168,24 @@ public class WordToFoExtractor {
|
||||||
return wordToFoExtractor.getDocument();
|
return wordToFoExtractor.getDocument();
|
||||||
}
|
}
|
||||||
|
|
||||||
private final Document document;
|
private final Stack<BlockProperies> blocksProperies = new Stack<BlockProperies>();
|
||||||
|
|
||||||
private final Element layoutMasterSet;
|
/**
 * Creates a new {@link WordToFoExtractor} that writes into the given DOM
 * document. One instance can be used to output several
 * {@link HWPFDocument}s into a single FO document.
 *
 * @param document
 *            XML DOM Document used as XSL FO document. Shall support
 *            namespaces
 */
public WordToFoExtractor( Document document )
{
    // creation of the FO root structure is done by the superclass
    super( document );
}
|
||||||
|
|
||||||
protected String createPageMaster( SectionProperties sep, String type,
|
protected String createPageMaster( SectionProperties sep, String type,
|
||||||
int section) {
|
int section )
|
||||||
|
{
|
||||||
float height = sep.getYaPage() / TWIPS_PER_INCH;
|
float height = sep.getYaPage() / TWIPS_PER_INCH;
|
||||||
float width = sep.getXaPage() / TWIPS_PER_INCH;
|
float width = sep.getXaPage() / TWIPS_PER_INCH;
|
||||||
float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH;
|
float leftMargin = sep.getDxaLeft() / TWIPS_PER_INCH;
|
||||||
|
@ -234,12 +214,17 @@ public class WordToFoExtractor {
|
||||||
// WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left");
|
// WordToFoUtils.setBorder(regionBody, sep.getBrcLeft(), "left");
|
||||||
// WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right");
|
// WordToFoUtils.setBorder(regionBody, sep.getBrcRight(), "right");
|
||||||
|
|
||||||
if (sep.getCcolM1() > 0) {
|
if ( sep.getCcolM1() > 0 )
|
||||||
regionBody.setAttribute("column-count", "" + (sep.getCcolM1() + 1));
|
{
|
||||||
if (sep.getFEvenlySpaced()) {
|
regionBody
|
||||||
|
.setAttribute( "column-count", "" + (sep.getCcolM1() + 1) );
|
||||||
|
if ( sep.getFEvenlySpaced() )
|
||||||
|
{
|
||||||
regionBody.setAttribute( "column-gap",
|
regionBody.setAttribute( "column-gap",
|
||||||
(sep.getDxaColumns() / TWIPS_PER_INCH) + "in" );
|
(sep.getDxaColumns() / TWIPS_PER_INCH) + "in" );
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
regionBody.setAttribute( "column-gap", "0.25in" );
|
regionBody.setAttribute( "column-gap", "0.25in" );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -247,38 +232,173 @@ public class WordToFoExtractor {
|
||||||
return pageMasterName;
|
return pageMasterName;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Element createTableBody() {
|
protected boolean processCharacters( HWPFDocument hwpfDocument,
|
||||||
return document.createElementNS(NS_XSLFO, "fo:table-body");
|
int currentTableLevel, Paragraph paragraph, final Element block,
|
||||||
|
final int start, final int end )
|
||||||
|
{
|
||||||
|
boolean haveAnyText = false;
|
||||||
|
|
||||||
|
for ( int c = start; c < end; c++ )
|
||||||
|
{
|
||||||
|
CharacterRun characterRun = paragraph.getCharacterRun( c );
|
||||||
|
|
||||||
|
if ( hwpfDocument.getPicturesTable().hasPicture( characterRun ) )
|
||||||
|
{
|
||||||
|
Picture picture = hwpfDocument.getPicturesTable()
|
||||||
|
.extractPicture( characterRun, true );
|
||||||
|
|
||||||
|
processImage( block, characterRun.text().charAt( 0 ) == 0x01,
|
||||||
|
picture );
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Element createTableCell() {
|
String text = characterRun.text();
|
||||||
return document.createElementNS(NS_XSLFO, "fo:table-cell");
|
if ( text.getBytes().length == 0 )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
|
||||||
|
{
|
||||||
|
int skipTo = tryField( hwpfDocument, paragraph,
|
||||||
|
currentTableLevel, c, block );
|
||||||
|
|
||||||
|
if ( skipTo != c )
|
||||||
|
{
|
||||||
|
c = skipTo;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Element createTableHeader() {
|
continue;
|
||||||
return document.createElementNS(NS_XSLFO, "fo:table-header");
|
}
|
||||||
|
if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
|
||||||
|
{
|
||||||
|
// shall not appear without FIELD_BEGIN_MARK
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if ( text.getBytes()[0] == FIELD_END_MARK )
|
||||||
|
{
|
||||||
|
// shall not appear without FIELD_BEGIN_MARK
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Element createTableRow() {
|
if ( characterRun.isSpecialCharacter() || characterRun.isObj()
|
||||||
return document.createElementNS(NS_XSLFO, "fo:table-row");
|
|| characterRun.isOle2() )
|
||||||
|
{
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Text createText(String data) {
|
BlockProperies blockProperies = this.blocksProperies.peek();
|
||||||
return document.createTextNode(data);
|
Element inline = createInline();
|
||||||
|
if ( characterRun.isBold() != blockProperies.pBold )
|
||||||
|
{
|
||||||
|
WordToFoUtils.setBold( inline, characterRun.isBold() );
|
||||||
|
}
|
||||||
|
if ( characterRun.isItalic() != blockProperies.pItalic )
|
||||||
|
{
|
||||||
|
WordToFoUtils.setItalic( inline, characterRun.isItalic() );
|
||||||
|
}
|
||||||
|
if ( !WordToFoUtils.equals( characterRun.getFontName(),
|
||||||
|
blockProperies.pFontName ) )
|
||||||
|
{
|
||||||
|
WordToFoUtils
|
||||||
|
.setFontFamily( inline, characterRun.getFontName() );
|
||||||
|
}
|
||||||
|
if ( characterRun.getFontSize() / 2 != blockProperies.pFontSize )
|
||||||
|
{
|
||||||
|
WordToFoUtils.setFontSize( inline,
|
||||||
|
characterRun.getFontSize() / 2 );
|
||||||
|
}
|
||||||
|
WordToFoUtils.setCharactersProperties( characterRun, inline );
|
||||||
|
block.appendChild( inline );
|
||||||
|
|
||||||
|
if ( text.endsWith( "\r" )
|
||||||
|
|| (text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != 0) )
|
||||||
|
text = text.substring( 0, text.length() - 1 );
|
||||||
|
|
||||||
|
Text textNode = createText( text );
|
||||||
|
inline.appendChild( textNode );
|
||||||
|
|
||||||
|
haveAnyText |= text.trim().length() != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Document getDocument() {
|
return haveAnyText;
|
||||||
return document;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void processDocument(HWPFDocument hwpfDocument) {
|
public void processDocument( HWPFDocument hwpfDocument )
|
||||||
|
{
|
||||||
final Range range = hwpfDocument.getRange();
|
final Range range = hwpfDocument.getRange();
|
||||||
|
|
||||||
for (int s = 0; s < range.numSections(); s++) {
|
for ( int s = 0; s < range.numSections(); s++ )
|
||||||
|
{
|
||||||
processSection( hwpfDocument, range.getSection( s ), s );
|
processSection( hwpfDocument, range.getSection( s ), s );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void processField( HWPFDocument hwpfDocument,
|
||||||
|
Element currentBlock, Paragraph paragraph, int currentTableLevel,
|
||||||
|
int beginMark, int separatorMark, int endMark )
|
||||||
|
{
|
||||||
|
|
||||||
|
Pattern hyperlinkPattern = Pattern
|
||||||
|
.compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
|
||||||
|
Pattern pagerefPattern = Pattern
|
||||||
|
.compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
|
||||||
|
|
||||||
|
if ( separatorMark - beginMark > 1 )
|
||||||
|
{
|
||||||
|
CharacterRun firstAfterBegin = paragraph
|
||||||
|
.getCharacterRun( beginMark + 1 );
|
||||||
|
|
||||||
|
final Matcher hyperlinkMatcher = hyperlinkPattern
|
||||||
|
.matcher( firstAfterBegin.text() );
|
||||||
|
if ( hyperlinkMatcher.matches() )
|
||||||
|
{
|
||||||
|
String hyperlink = hyperlinkMatcher.group( 1 );
|
||||||
|
processHyperlink( hwpfDocument, currentBlock, paragraph,
|
||||||
|
currentTableLevel, hyperlink, separatorMark + 1,
|
||||||
|
endMark );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
final Matcher pagerefMatcher = pagerefPattern
|
||||||
|
.matcher( firstAfterBegin.text() );
|
||||||
|
if ( pagerefMatcher.matches() )
|
||||||
|
{
|
||||||
|
String pageref = pagerefMatcher.group( 1 );
|
||||||
|
processPageref( hwpfDocument, currentBlock, paragraph,
|
||||||
|
currentTableLevel, pageref, separatorMark + 1, endMark );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
|
||||||
|
for ( int i = beginMark; i <= endMark; i++ )
|
||||||
|
{
|
||||||
|
debug.append( "\t" );
|
||||||
|
debug.append( paragraph.getCharacterRun( i ) );
|
||||||
|
debug.append( "\n" );
|
||||||
|
}
|
||||||
|
logger.log( POILogger.WARN, debug );
|
||||||
|
|
||||||
|
// just output field value
|
||||||
|
if ( separatorMark + 1 < endMark )
|
||||||
|
processCharacters( hwpfDocument, currentTableLevel, paragraph,
|
||||||
|
currentBlock, separatorMark + 1, endMark );
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void processHyperlink( HWPFDocument hwpfDocument,
|
||||||
|
Element currentBlock, Paragraph paragraph, int currentTableLevel,
|
||||||
|
String hyperlink, int beginTextInclusive, int endTextExclusive )
|
||||||
|
{
|
||||||
|
Element basicLink = createBasicLinkExternal( hyperlink );
|
||||||
|
currentBlock.appendChild( basicLink );
|
||||||
|
|
||||||
|
if ( beginTextInclusive < endTextExclusive )
|
||||||
|
processCharacters( hwpfDocument, currentTableLevel, paragraph,
|
||||||
|
basicLink, beginTextInclusive, endTextExclusive );
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method shall store image bytes in external file and convert it if
|
* This method shall store image bytes in external file and convert it if
|
||||||
* necessary. Images shall be stored using PNG format (for bitmap) or SVG
|
* necessary. Images shall be stored using PNG format (for bitmap) or SVG
|
||||||
|
@ -299,13 +419,29 @@ public class WordToFoExtractor {
|
||||||
* HWPF object, contained picture data and properties
|
* HWPF object, contained picture data and properties
|
||||||
*/
|
*/
|
||||||
protected void processImage( Element currentBlock, boolean inlined,
|
protected void processImage( Element currentBlock, boolean inlined,
|
||||||
Picture picture) {
|
Picture picture )
|
||||||
|
{
|
||||||
// no default implementation -- skip
|
// no default implementation -- skip
|
||||||
|
currentBlock.appendChild( document.createComment( "Image link to '"
|
||||||
|
+ picture.suggestFullFileName() + "' can be here" ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void processPageref( HWPFDocument hwpfDocument,
|
||||||
|
Element currentBlock, Paragraph paragraph, int currentTableLevel,
|
||||||
|
String pageref, int beginTextInclusive, int endTextExclusive )
|
||||||
|
{
|
||||||
|
Element basicLink = createBasicLinkInternal( pageref );
|
||||||
|
currentBlock.appendChild( basicLink );
|
||||||
|
|
||||||
|
if ( beginTextInclusive < endTextExclusive )
|
||||||
|
processCharacters( hwpfDocument, currentTableLevel, paragraph,
|
||||||
|
basicLink, beginTextInclusive, endTextExclusive );
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void processParagraph( HWPFDocument hwpfDocument,
|
protected void processParagraph( HWPFDocument hwpfDocument,
|
||||||
Element parentFopElement, int currentTableLevel,
|
Element parentFopElement, int currentTableLevel,
|
||||||
Paragraph paragraph, String bulletText) {
|
Paragraph paragraph, String bulletText )
|
||||||
|
{
|
||||||
final Element block = createBlock();
|
final Element block = createBlock();
|
||||||
parentFopElement.appendChild( block );
|
parentFopElement.appendChild( block );
|
||||||
|
|
||||||
|
@ -313,10 +449,12 @@ public class WordToFoExtractor {
|
||||||
|
|
||||||
final int charRuns = paragraph.numCharacterRuns();
|
final int charRuns = paragraph.numCharacterRuns();
|
||||||
|
|
||||||
if (charRuns == 0) {
|
if ( charRuns == 0 )
|
||||||
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
final String pFontName;
|
final String pFontName;
|
||||||
final int pFontSize;
|
final int pFontSize;
|
||||||
final boolean pBold;
|
final boolean pBold;
|
||||||
|
@ -333,97 +471,44 @@ public class WordToFoExtractor {
|
||||||
WordToFoUtils.setBold( block, pBold );
|
WordToFoUtils.setBold( block, pBold );
|
||||||
WordToFoUtils.setItalic( block, pItalic );
|
WordToFoUtils.setItalic( block, pItalic );
|
||||||
|
|
||||||
StringBuilder lineText = new StringBuilder();
|
blocksProperies.push( new BlockProperies( pFontName, pFontSize,
|
||||||
|
pBold, pItalic ) );
|
||||||
|
}
|
||||||
|
try
|
||||||
|
{
|
||||||
|
boolean haveAnyText = false;
|
||||||
|
|
||||||
if (WordToFoUtils.isNotEmpty(bulletText)) {
|
if ( WordToFoUtils.isNotEmpty( bulletText ) )
|
||||||
|
{
|
||||||
Element inline = createInline();
|
Element inline = createInline();
|
||||||
block.appendChild( inline );
|
block.appendChild( inline );
|
||||||
|
|
||||||
Text textNode = createText( bulletText );
|
Text textNode = createText( bulletText );
|
||||||
inline.appendChild( textNode );
|
inline.appendChild( textNode );
|
||||||
|
|
||||||
lineText.append(bulletText);
|
haveAnyText |= bulletText.trim().length() != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int c = 0; c < charRuns; c++) {
|
haveAnyText = processCharacters( hwpfDocument, currentTableLevel,
|
||||||
CharacterRun characterRun = paragraph.getCharacterRun(c);
|
paragraph, block, 0, charRuns );
|
||||||
|
|
||||||
if (hwpfDocument.getPicturesTable().hasPicture(characterRun)) {
|
if ( !haveAnyText )
|
||||||
Picture picture = hwpfDocument.getPicturesTable()
|
{
|
||||||
.extractPicture(characterRun, true);
|
|
||||||
|
|
||||||
processImage(block, characterRun.text().charAt(0) == 0x01,
|
|
||||||
picture);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
String text = characterRun.text();
|
|
||||||
if (text.getBytes().length == 0)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (text.getBytes()[0] == FIELD_BEGIN_MARK) {
|
|
||||||
/*
|
|
||||||
* check if we have a field with calculated image as a result.
|
|
||||||
* MathType equation, for example.
|
|
||||||
*/
|
|
||||||
int skipTo = tryImageWithinField(hwpfDocument, paragraph, c,
|
|
||||||
block);
|
|
||||||
|
|
||||||
if (skipTo != c) {
|
|
||||||
c = skipTo;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (text.getBytes()[0] == FIELD_SEPARATOR_MARK) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (text.getBytes()[0] == FIELD_END_MARK) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (characterRun.isSpecialCharacter() || characterRun.isObj()
|
|
||||||
|| characterRun.isOle2()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Element inline = createInline();
|
|
||||||
if (characterRun.isBold() != pBold) {
|
|
||||||
WordToFoUtils.setBold(inline, characterRun.isBold());
|
|
||||||
}
|
|
||||||
if (characterRun.isItalic() != pItalic) {
|
|
||||||
WordToFoUtils.setItalic(inline, characterRun.isItalic());
|
|
||||||
}
|
|
||||||
if (!WordToFoUtils.equals(characterRun.getFontName(), pFontName)) {
|
|
||||||
WordToFoUtils.setFontFamily(inline, characterRun.getFontName());
|
|
||||||
}
|
|
||||||
if (characterRun.getFontSize() / 2 != pFontSize) {
|
|
||||||
WordToFoUtils.setFontSize(inline,
|
|
||||||
characterRun.getFontSize() / 2);
|
|
||||||
}
|
|
||||||
WordToFoUtils.setCharactersProperties(characterRun, inline);
|
|
||||||
block.appendChild(inline);
|
|
||||||
|
|
||||||
if (text.endsWith("\r")
|
|
||||||
|| (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != 0))
|
|
||||||
text = text.substring(0, text.length() - 1);
|
|
||||||
|
|
||||||
Text textNode = createText(text);
|
|
||||||
inline.appendChild(textNode);
|
|
||||||
|
|
||||||
lineText.append(text);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lineText.toString().trim().length() == 0) {
|
|
||||||
Element leader = createLeader();
|
Element leader = createLeader();
|
||||||
block.appendChild( leader );
|
block.appendChild( leader );
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
blocksProperies.pop();
|
||||||
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void processSection( HWPFDocument hwpfDocument, Section section,
|
protected void processSection( HWPFDocument hwpfDocument, Section section,
|
||||||
int sectionCounter) {
|
int sectionCounter )
|
||||||
|
{
|
||||||
String regularPage = createPageMaster(
|
String regularPage = createPageMaster(
|
||||||
WordToFoUtils.getSectionProperties( section ), "page",
|
WordToFoUtils.getSectionProperties( section ), "page",
|
||||||
sectionCounter );
|
sectionCounter );
|
||||||
|
@ -435,10 +520,12 @@ public class WordToFoExtractor {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void processSectionParagraphes( HWPFDocument hwpfDocument,
|
protected void processSectionParagraphes( HWPFDocument hwpfDocument,
|
||||||
Element flow, Range range, int currentTableLevel) {
|
Element flow, Range range, int currentTableLevel )
|
||||||
|
{
|
||||||
final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
|
final Map<Integer, Table> allTables = new HashMap<Integer, Table>();
|
||||||
for ( TableIterator tableIterator = WordToFoUtils.newTableIterator(
|
for ( TableIterator tableIterator = WordToFoUtils.newTableIterator(
|
||||||
range, currentTableLevel + 1); tableIterator.hasNext();) {
|
range, currentTableLevel + 1 ); tableIterator.hasNext(); )
|
||||||
|
{
|
||||||
Table next = tableIterator.next();
|
Table next = tableIterator.next();
|
||||||
allTables.put( Integer.valueOf( next.getStartOffset() ), next );
|
allTables.put( Integer.valueOf( next.getStartOffset() ), next );
|
||||||
}
|
}
|
||||||
|
@ -447,11 +534,13 @@ public class WordToFoExtractor {
|
||||||
int currentListInfo = 0;
|
int currentListInfo = 0;
|
||||||
|
|
||||||
final int paragraphs = range.numParagraphs();
|
final int paragraphs = range.numParagraphs();
|
||||||
for (int p = 0; p < paragraphs; p++) {
|
for ( int p = 0; p < paragraphs; p++ )
|
||||||
|
{
|
||||||
Paragraph paragraph = range.getParagraph( p );
|
Paragraph paragraph = range.getParagraph( p );
|
||||||
|
|
||||||
if ( allTables.containsKey( Integer.valueOf( paragraph
|
if ( allTables.containsKey( Integer.valueOf( paragraph
|
||||||
.getStartOffset()))) {
|
.getStartOffset() ) ) )
|
||||||
|
{
|
||||||
Table table = allTables.get( Integer.valueOf( paragraph
|
Table table = allTables.get( Integer.valueOf( paragraph
|
||||||
.getStartOffset() ) );
|
.getStartOffset() ) );
|
||||||
processTable( hwpfDocument, flow, table, currentTableLevel + 1 );
|
processTable( hwpfDocument, flow, table, currentTableLevel + 1 );
|
||||||
|
@ -459,15 +548,18 @@ public class WordToFoExtractor {
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( paragraph.isInTable()
|
if ( paragraph.isInTable()
|
||||||
&& paragraph.getTableLevel() != currentTableLevel) {
|
&& paragraph.getTableLevel() != currentTableLevel )
|
||||||
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (paragraph.getIlfo() != currentListInfo) {
|
if ( paragraph.getIlfo() != currentListInfo )
|
||||||
|
{
|
||||||
currentListInfo = paragraph.getIlfo();
|
currentListInfo = paragraph.getIlfo();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (currentListInfo != 0) {
|
if ( currentListInfo != 0 )
|
||||||
|
{
|
||||||
final ListFormatOverride listFormatOverride = listTables
|
final ListFormatOverride listFormatOverride = listTables
|
||||||
.getOverride( paragraph.getIlfo() );
|
.getOverride( paragraph.getIlfo() );
|
||||||
|
|
||||||
|
@ -476,7 +568,9 @@ public class WordToFoExtractor {
|
||||||
|
|
||||||
processParagraph( hwpfDocument, flow, currentTableLevel,
|
processParagraph( hwpfDocument, flow, currentTableLevel,
|
||||||
paragraph, label );
|
paragraph, label );
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
processParagraph( hwpfDocument, flow, currentTableLevel,
|
processParagraph( hwpfDocument, flow, currentTableLevel,
|
||||||
paragraph, WordToFoUtils.EMPTY );
|
paragraph, WordToFoUtils.EMPTY );
|
||||||
}
|
}
|
||||||
|
@ -485,7 +579,8 @@ public class WordToFoExtractor {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void processTable( HWPFDocument hwpfDocument, Element flow,
|
protected void processTable( HWPFDocument hwpfDocument, Element flow,
|
||||||
Table table, int thisTableLevel) {
|
Table table, int thisTableLevel )
|
||||||
|
{
|
||||||
Element tableElement = addTable( flow );
|
Element tableElement = addTable( flow );
|
||||||
|
|
||||||
Element tableHeader = createTableHeader();
|
Element tableHeader = createTableHeader();
|
||||||
|
@ -494,18 +589,21 @@ public class WordToFoExtractor {
|
||||||
final int tableRows = table.numRows();
|
final int tableRows = table.numRows();
|
||||||
|
|
||||||
int maxColumns = Integer.MIN_VALUE;
|
int maxColumns = Integer.MIN_VALUE;
|
||||||
for (int r = 0; r < tableRows; r++) {
|
for ( int r = 0; r < tableRows; r++ )
|
||||||
|
{
|
||||||
maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );
|
maxColumns = Math.max( maxColumns, table.getRow( r ).numCells() );
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int r = 0; r < tableRows; r++) {
|
for ( int r = 0; r < tableRows; r++ )
|
||||||
|
{
|
||||||
TableRow tableRow = table.getRow( r );
|
TableRow tableRow = table.getRow( r );
|
||||||
|
|
||||||
Element tableRowElement = createTableRow();
|
Element tableRowElement = createTableRow();
|
||||||
WordToFoUtils.setTableRowProperties( tableRow, tableRowElement );
|
WordToFoUtils.setTableRowProperties( tableRow, tableRowElement );
|
||||||
|
|
||||||
final int rowCells = tableRow.numCells();
|
final int rowCells = tableRow.numCells();
|
||||||
for (int c = 0; c < rowCells; c++) {
|
for ( int c = 0; c < rowCells; c++ )
|
||||||
|
{
|
||||||
TableCell tableCell = tableRow.getCell( c );
|
TableCell tableCell = tableRow.getCell( c );
|
||||||
|
|
||||||
if ( tableCell.isMerged() && !tableCell.isFirstMerged() )
|
if ( tableCell.isMerged() && !tableCell.isFirstMerged() )
|
||||||
|
@ -520,9 +618,11 @@ public class WordToFoExtractor {
|
||||||
tableCellElement, r == 0, r == tableRows - 1, c == 0,
|
tableCellElement, r == 0, r == tableRows - 1, c == 0,
|
||||||
c == rowCells - 1 );
|
c == rowCells - 1 );
|
||||||
|
|
||||||
if (tableCell.isFirstMerged()) {
|
if ( tableCell.isFirstMerged() )
|
||||||
|
{
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (int c1 = c; c1 < rowCells; c1++) {
|
for ( int c1 = c; c1 < rowCells; c1++ )
|
||||||
|
{
|
||||||
TableCell nextCell = tableRow.getCell( c1 );
|
TableCell nextCell = tableRow.getCell( c1 );
|
||||||
if ( nextCell.isMerged() )
|
if ( nextCell.isMerged() )
|
||||||
count++;
|
count++;
|
||||||
|
@ -531,16 +631,22 @@ public class WordToFoExtractor {
|
||||||
}
|
}
|
||||||
tableCellElement.setAttribute( "number-columns-spanned", ""
|
tableCellElement.setAttribute( "number-columns-spanned", ""
|
||||||
+ count );
|
+ count );
|
||||||
} else {
|
}
|
||||||
if (c == rowCells - 1 && c != maxColumns - 1) {
|
else
|
||||||
tableCellElement.setAttribute("number-columns-spanned",
|
{
|
||||||
"" + (maxColumns - c));
|
if ( c == rowCells - 1 && c != maxColumns - 1 )
|
||||||
|
{
|
||||||
|
tableCellElement
|
||||||
|
.setAttribute( "number-columns-spanned", ""
|
||||||
|
+ (maxColumns - c) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tableCell.isFirstVerticallyMerged()) {
|
if ( tableCell.isFirstVerticallyMerged() )
|
||||||
|
{
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (int r1 = r; r1 < tableRows; r1++) {
|
for ( int r1 = r; r1 < tableRows; r1++ )
|
||||||
|
{
|
||||||
TableRow nextRow = table.getRow( r1 );
|
TableRow nextRow = table.getRow( r1 );
|
||||||
if ( nextRow.numCells() < c )
|
if ( nextRow.numCells() < c )
|
||||||
break;
|
break;
|
||||||
|
@ -557,45 +663,59 @@ public class WordToFoExtractor {
|
||||||
processSectionParagraphes( hwpfDocument, tableCellElement,
|
processSectionParagraphes( hwpfDocument, tableCellElement,
|
||||||
tableCell, thisTableLevel );
|
tableCell, thisTableLevel );
|
||||||
|
|
||||||
if (!tableCellElement.hasChildNodes()) {
|
if ( !tableCellElement.hasChildNodes() )
|
||||||
|
{
|
||||||
tableCellElement.appendChild( createBlock() );
|
tableCellElement.appendChild( createBlock() );
|
||||||
}
|
}
|
||||||
|
|
||||||
tableRowElement.appendChild( tableCellElement );
|
tableRowElement.appendChild( tableCellElement );
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tableRow.isTableHeader()) {
|
if ( tableRow.isTableHeader() )
|
||||||
|
{
|
||||||
tableHeader.appendChild( tableRowElement );
|
tableHeader.appendChild( tableRowElement );
|
||||||
} else {
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
tableBody.appendChild( tableRowElement );
|
tableBody.appendChild( tableRowElement );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tableHeader.hasChildNodes()) {
|
if ( tableHeader.hasChildNodes() )
|
||||||
|
{
|
||||||
tableElement.appendChild( tableHeader );
|
tableElement.appendChild( tableHeader );
|
||||||
}
|
}
|
||||||
if (tableBody.hasChildNodes()) {
|
if ( tableBody.hasChildNodes() )
|
||||||
|
{
|
||||||
tableElement.appendChild( tableBody );
|
tableElement.appendChild( tableBody );
|
||||||
} else {
|
}
|
||||||
System.err.println("Table without body");
|
else
|
||||||
|
{
|
||||||
|
logger.log(
|
||||||
|
POILogger.WARN,
|
||||||
|
"Table without body starting on offset "
|
||||||
|
+ table.getStartOffset() + " -- "
|
||||||
|
+ table.getEndOffset() );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected int tryImageWithinField(HWPFDocument hwpfDocument,
|
protected int tryField( HWPFDocument hwpfDocument, Paragraph paragraph,
|
||||||
Paragraph paragraph, int beginMark, Element currentBlock) {
|
int currentTableLevel, int beginMark, Element currentBlock )
|
||||||
|
{
|
||||||
int separatorMark = -1;
|
int separatorMark = -1;
|
||||||
int pictureMark = -1;
|
|
||||||
int pictureChar = Integer.MIN_VALUE;
|
|
||||||
int endMark = -1;
|
int endMark = -1;
|
||||||
for (int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++) {
|
for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ )
|
||||||
|
{
|
||||||
CharacterRun characterRun = paragraph.getCharacterRun( c );
|
CharacterRun characterRun = paragraph.getCharacterRun( c );
|
||||||
|
|
||||||
String text = characterRun.text();
|
String text = characterRun.text();
|
||||||
if ( text.getBytes().length == 0 )
|
if ( text.getBytes().length == 0 )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (text.getBytes()[0] == FIELD_SEPARATOR_MARK) {
|
if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
|
||||||
if (separatorMark != -1) {
|
{
|
||||||
|
if ( separatorMark != -1 )
|
||||||
|
{
|
||||||
// double;
|
// double;
|
||||||
return beginMark;
|
return beginMark;
|
||||||
}
|
}
|
||||||
|
@ -604,8 +724,10 @@ public class WordToFoExtractor {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (text.getBytes()[0] == FIELD_END_MARK) {
|
if ( text.getBytes()[0] == FIELD_END_MARK )
|
||||||
if (endMark != -1) {
|
{
|
||||||
|
if ( endMark != -1 )
|
||||||
|
{
|
||||||
// double;
|
// double;
|
||||||
return beginMark;
|
return beginMark;
|
||||||
}
|
}
|
||||||
|
@ -614,63 +736,14 @@ public class WordToFoExtractor {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hwpfDocument.getPicturesTable().hasPicture(characterRun)) {
|
|
||||||
if (c != -1) {
|
|
||||||
// double;
|
|
||||||
return beginMark;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pictureMark = c;
|
if ( separatorMark == -1 || endMark == -1 )
|
||||||
pictureChar = characterRun.text().charAt(0);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (separatorMark == -1 || pictureMark == -1 || endMark == -1)
|
|
||||||
return beginMark;
|
return beginMark;
|
||||||
|
|
||||||
final CharacterRun pictureRun = paragraph.getCharacterRun(pictureMark);
|
processField( hwpfDocument, currentBlock, paragraph, currentTableLevel,
|
||||||
final Picture picture = hwpfDocument.getPicturesTable().extractPicture(
|
beginMark, separatorMark, endMark );
|
||||||
pictureRun, true);
|
|
||||||
|
|
||||||
processImage(currentBlock, pictureChar == 0x01, picture);
|
|
||||||
|
|
||||||
return endMark;
|
return endMark;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Java main() interface to interact with WordToFoExtractor
|
|
||||||
*
|
|
||||||
* <p>
|
|
||||||
* Usage: WordToFoExtractor infile outfile
|
|
||||||
* </p>
|
|
||||||
* Where infile is an input .doc file ( Word 97-2007)
|
|
||||||
* which will be rendered as XSL-FO into outfile
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
public static void main(String[] args) {
|
|
||||||
if (args.length < 2) {
|
|
||||||
System.err.println("Usage: WordToFoExtractor <inputFile.doc> <saveTo.fo>");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
System.out.println("Converting " + args[0]);
|
|
||||||
System.out.println("Saving output to " + args[1]);
|
|
||||||
try {
|
|
||||||
Document doc = WordToFoExtractor.process(new File(args[0]));
|
|
||||||
|
|
||||||
FileWriter out = new FileWriter(args[1]);
|
|
||||||
DOMSource domSource = new DOMSource(doc);
|
|
||||||
StreamResult streamResult = new StreamResult(out);
|
|
||||||
TransformerFactory tf = TransformerFactory.newInstance();
|
|
||||||
Transformer serializer = tf.newTransformer();
|
|
||||||
serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); // TODO set encoding from a command argument
|
|
||||||
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
|
|
||||||
serializer.transform(domSource, streamResult);
|
|
||||||
out.close();
|
|
||||||
} catch (Exception e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,95 @@
|
||||||
|
/*
|
||||||
|
* ====================================================================
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
* ====================================================================
|
||||||
|
*/
|
||||||
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
|
import java.io.StringWriter;
|
||||||
|
|
||||||
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
|
import javax.xml.transform.OutputKeys;
|
||||||
|
import javax.xml.transform.Transformer;
|
||||||
|
import javax.xml.transform.TransformerFactory;
|
||||||
|
import javax.xml.transform.dom.DOMSource;
|
||||||
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import org.apache.poi.POIDataSamples;
|
||||||
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test cases for {@link WordToFoExtractor}
|
||||||
|
*
|
||||||
|
* @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
|
||||||
|
*/
|
||||||
|
public class TestWordToFoExtractor extends TestCase
|
||||||
|
{
|
||||||
|
private static String getFoText( final String sampleFileName )
|
||||||
|
throws Exception
|
||||||
|
{
|
||||||
|
HWPFDocument hwpfDocument = new HWPFDocument( POIDataSamples
|
||||||
|
.getDocumentInstance().openResourceAsStream( sampleFileName ) );
|
||||||
|
|
||||||
|
WordToFoExtractor wordToFoExtractor = new WordToFoExtractor(
|
||||||
|
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||||
|
.newDocument() );
|
||||||
|
wordToFoExtractor.processDocument( hwpfDocument );
|
||||||
|
|
||||||
|
StringWriter stringWriter = new StringWriter();
|
||||||
|
|
||||||
|
Transformer transformer = TransformerFactory.newInstance()
|
||||||
|
.newTransformer();
|
||||||
|
transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
|
||||||
|
transformer.transform(
|
||||||
|
new DOMSource( wordToFoExtractor.getDocument() ),
|
||||||
|
new StreamResult( stringWriter ) );
|
||||||
|
|
||||||
|
String result = stringWriter.toString();
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testHyperlink() throws Exception
|
||||||
|
{
|
||||||
|
final String sampleFileName = "hyperlink.doc";
|
||||||
|
String result = getFoText( sampleFileName );
|
||||||
|
|
||||||
|
assertTrue( result
|
||||||
|
.contains( "<fo:basic-link external-destination=\"http://testuri.org/\">" ) );
|
||||||
|
assertTrue( result.contains( "Hyperlink text" ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testEquation() throws Exception
|
||||||
|
{
|
||||||
|
final String sampleFileName = "equation.doc";
|
||||||
|
String result = getFoText( sampleFileName );
|
||||||
|
|
||||||
|
assertTrue( result
|
||||||
|
.contains( "<!--Image link to '0.emf' can be here-->" ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPageref() throws Exception
|
||||||
|
{
|
||||||
|
final String sampleFileName = "pageref.doc";
|
||||||
|
String result = getFoText( sampleFileName );
|
||||||
|
|
||||||
|
System.out.println( result );
|
||||||
|
|
||||||
|
assertTrue( result
|
||||||
|
.contains( "<fo:basic-link internal-destination=\"userref\">" ) );
|
||||||
|
assertTrue( result.contains( "1" ) );
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue