add bookmarks support for Word-to-HTML and Word-to-FO converters

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1148824 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-07-20 16:01:19 +00:00
parent d21d9d8fce
commit 4fe8a5ab89
7 changed files with 351 additions and 80 deletions

View File

@ -16,6 +16,12 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.hwpf.converter; package org.apache.poi.hwpf.converter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -23,10 +29,11 @@ import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.FontReplacer.Triplet; import org.apache.poi.hwpf.converter.FontReplacer.Triplet;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.Field; import org.apache.poi.hwpf.model.Field;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.ListFormatOverride; import org.apache.poi.hwpf.model.ListFormatOverride;
import org.apache.poi.hwpf.model.ListTables; import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Picture;
@ -51,6 +58,8 @@ public abstract class AbstractWordConverter
private static final POILogger logger = POILogFactory private static final POILogger logger = POILogFactory
.getLogger( AbstractWordConverter.class ); .getLogger( AbstractWordConverter.class );
private final Set<Bookmark> bookmarkStack = new LinkedHashSet<Bookmark>();
private FontReplacer fontReplacer = new DefaultFontReplacer(); private FontReplacer fontReplacer = new DefaultFontReplacer();
protected Triplet getCharacterRunTriplet( CharacterRun characterRun ) protected Triplet getCharacterRunTriplet( CharacterRun characterRun )
@ -73,7 +82,16 @@ public abstract class AbstractWordConverter
protected abstract void outputCharacters( Element block, protected abstract void outputCharacters( Element block,
CharacterRun characterRun, String text ); CharacterRun characterRun, String text );
protected boolean processCharacters( HWPFDocumentCore hwpfDocument, /**
* Wraps the range into bookmark(s) and processes it. All bookmarks start at
* the range start and end at the range end. Usually there is only one
* bookmark.
*/
protected abstract void processBookmarks( HWPFDocumentCore wordDocument,
Element currentBlock, Range range, int currentTableLevel,
List<Bookmark> rangeBookmarks );
protected boolean processCharacters( HWPFDocumentCore document,
int currentTableLevel, Range range, final Element block ) int currentTableLevel, Range range, final Element block )
{ {
if ( range == null ) if ( range == null )
@ -81,6 +99,22 @@ public abstract class AbstractWordConverter
boolean haveAnyText = false; boolean haveAnyText = false;
if ( document instanceof HWPFDocument )
{
final HWPFDocument doc = (HWPFDocument) document;
Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks()
.getBookmarksStartedBetween( range.getStartOffset(),
range.getEndOffset() );
if ( rangeBookmarks != null && !rangeBookmarks.isEmpty() )
{
boolean processedAny = processRangeBookmarks( doc,
currentTableLevel, range, block, rangeBookmarks );
if ( processedAny )
return true;
}
}
for ( int c = 0; c < range.numCharacterRuns(); c++ ) for ( int c = 0; c < range.numCharacterRuns(); c++ )
{ {
CharacterRun characterRun = range.getCharacterRun( c ); CharacterRun characterRun = range.getCharacterRun( c );
@ -88,11 +122,11 @@ public abstract class AbstractWordConverter
if ( characterRun == null ) if ( characterRun == null )
throw new AssertionError(); throw new AssertionError();
if ( hwpfDocument instanceof HWPFDocument if ( document instanceof HWPFDocument
&& ( (HWPFDocument) hwpfDocument ).getPicturesTable() && ( (HWPFDocument) document ).getPicturesTable()
.hasPicture( characterRun ) ) .hasPicture( characterRun ) )
{ {
HWPFDocument newFormat = (HWPFDocument) hwpfDocument; HWPFDocument newFormat = (HWPFDocument) document;
Picture picture = newFormat.getPicturesTable().extractPicture( Picture picture = newFormat.getPicturesTable().extractPicture(
characterRun, true ); characterRun, true );
@ -107,15 +141,15 @@ public abstract class AbstractWordConverter
if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
{ {
if ( hwpfDocument instanceof HWPFDocument ) if ( document instanceof HWPFDocument )
{ {
Field aliveField = ( (HWPFDocument) hwpfDocument ) Field aliveField = ( (HWPFDocument) document )
.getFieldsTables().lookupFieldByStartOffset( .getFieldsTables().lookupFieldByStartOffset(
FieldsDocumentPart.MAIN, FieldsDocumentPart.MAIN,
characterRun.getStartOffset() ); characterRun.getStartOffset() );
if ( aliveField != null ) if ( aliveField != null )
{ {
processField( ( (HWPFDocument) hwpfDocument ), range, processField( ( (HWPFDocument) document ), range,
currentTableLevel, aliveField, block ); currentTableLevel, aliveField, block );
int continueAfter = aliveField.getFieldEndOffset(); int continueAfter = aliveField.getFieldEndOffset();
@ -130,8 +164,8 @@ public abstract class AbstractWordConverter
} }
} }
int skipTo = tryDeadField( hwpfDocument, range, int skipTo = tryDeadField( document, range, currentTableLevel,
currentTableLevel, c, block ); c, block );
if ( skipTo != c ) if ( skipTo != c )
{ {
@ -337,6 +371,129 @@ public abstract class AbstractWordConverter
Element parentFopElement, int currentTableLevel, Element parentFopElement, int currentTableLevel,
Paragraph paragraph, String bulletText ); Paragraph paragraph, String bulletText );
/**
 * Splits <tt>range</tt> around the bookmarks that start inside it and
 * processes every piece: plain text before a bookmark and after the last
 * bookmark goes through {@link #processCharacters}, bookmarked text goes
 * through {@link #processBookmarks}.
 * <p>
 * Bookmarks that the callers are already processing (tracked in
 * <tt>bookmarkStack</tt>) are filtered out at the range start. Bookmarks
 * starting inside an already processed bookmark range are skipped here,
 * because the nested call made for that range has handled them.
 *
 * @param document
 *            document being converted
 * @param currentTableLevel
 *            current table nesting level, passed through unchanged
 * @param range
 *            range to split
 * @param block
 *            element that receives the output
 * @param rangeBookmarks
 *            bookmarks started inside <tt>range</tt>, keyed by start offset;
 *            iteration is expected to be in ascending offset order
 * @return <tt>true</tt> if the whole range has been processed here,
 *         <tt>false</tt> if the caller still has to process it
 */
private boolean processRangeBookmarks( HWPFDocumentCore document,
        int currentTableLevel, Range range, final Element block,
        Map<Integer, List<Bookmark>> rangeBookmarks )
{
    final int startOffset = range.getStartOffset();
    final int endOffset = range.getEndOffset();

    int beforeBookmarkStart = startOffset;
    for ( Map.Entry<Integer, List<Bookmark>> entry : rangeBookmarks
            .entrySet() )
    {
        final int entryStart = entry.getKey().intValue();
        if ( entryStart < beforeBookmarkStart )
        {
            /*
             * this start point lies inside a bookmark range that has
             * already been processed; the nested call for that range took
             * care of it. Processing it again would build an inverted
             * "before" range and output the bookmark twice.
             */
            continue;
        }

        final List<Bookmark> startedAt = entry.getValue();

        final List<Bookmark> bookmarks;
        if ( entryStart == startOffset && !bookmarkStack.isEmpty() )
        {
            /*
             * we need to filter out bookmarks that are already being
             * processed by the caller methods
             */
            List<Bookmark> filtered = new ArrayList<Bookmark>(
                    startedAt.size() );
            for ( Bookmark bookmark : startedAt )
            {
                if ( !this.bookmarkStack.contains( bookmark ) )
                    filtered.add( bookmark );
            }
            bookmarks = filtered;
        }
        else
        {
            bookmarks = startedAt;
        }

        if ( bookmarks.isEmpty() )
        {
            // no bookmarks left - skip to next start point
            continue;
        }

        // TODO: test me
        /*
         * we are processing only the bookmarks with the max size; they
         * shall be first in the sorted list. Other bookmarks will be
         * processed by the called method
         */
        final Bookmark firstBookmark = bookmarks.iterator().next();
        final int startBookmarkOffset = firstBookmark.getStart();
        final int endBookmarkOffset = Math.min( firstBookmark.getEnd(),
                endOffset );
        List<Bookmark> toProcess = new ArrayList<Bookmark>(
                bookmarks.size() );
        for ( Bookmark bookmark : bookmarks )
        {
            if ( Math.min( bookmark.getEnd(), endOffset ) != endBookmarkOffset )
                break;
            toProcess.add( bookmark );
        }

        if ( beforeBookmarkStart != startBookmarkOffset )
        {
            // we have range before bookmark
            Range beforeBookmarkRange = new Range( beforeBookmarkStart,
                    startBookmarkOffset, range )
            {
                @Override
                public String toString()
                {
                    return "BeforeBookmarkRange (" + super.toString() + ")";
                }
            };
            processCharacters( document, currentTableLevel,
                    beforeBookmarkRange, block );
        }

        Range bookmarkRange = new Range( startBookmarkOffset,
                endBookmarkOffset, range )
        {
            @Override
            public String toString()
            {
                return "BookmarkRange (" + super.toString() + ")";
            }
        };

        // mark the bookmarks as "in progress" so nested calls skip them
        bookmarkStack.addAll( toProcess );
        try
        {
            processBookmarks( document, block, bookmarkRange,
                    currentTableLevel,
                    Collections.unmodifiableList( toProcess ) );
        }
        finally
        {
            bookmarkStack.removeAll( toProcess );
        }
        beforeBookmarkStart = endBookmarkOffset;
    }

    if ( beforeBookmarkStart == startOffset )
    {
        // cursor never moved - let the caller process the range itself
        return false;
    }

    if ( beforeBookmarkStart != endOffset )
    {
        // we have range after last bookmark
        Range afterLastBookmarkRange = new Range( beforeBookmarkStart,
                endOffset, range )
        {
            @Override
            public String toString()
            {
                return "AfterBookmarkRange (" + super.toString() + ")";
            }
        };
        processCharacters( document, currentTableLevel,
                afterLastBookmarkRange, block );
    }
    return true;
}
protected abstract void processSection( HWPFDocumentCore wordDocument, protected abstract void processSection( HWPFDocumentCore wordDocument,
Section section, int s ); Section section, int s );

View File

@ -89,6 +89,13 @@ public class HtmlDocumentFacade
return document.createElement( "div" ); return document.createElement( "div" );
} }
/**
 * Creates an <tt>a</tt> element whose <tt>name</tt> attribute is set to the
 * given bookmark name. The element is created but not attached anywhere.
 *
 * @param name
 *            value for the <tt>name</tt> attribute
 * @return the newly created element
 */
public Element createBookmark( String name )
{
    final Element anchor = document.createElement( "a" );
    anchor.setAttribute( "name", name );
    return anchor;
}
public Element createHeader1() public Element createHeader1()
{ {
return document.createElement( "h1" ); return document.createElement( "h1" );

View File

@ -18,6 +18,7 @@ package org.apache.poi.hwpf.converter;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.util.List;
import java.util.Stack; import java.util.Stack;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
@ -31,6 +32,7 @@ import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.FontReplacer.Triplet; import org.apache.poi.hwpf.converter.FontReplacer.Triplet;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Picture;
@ -51,28 +53,6 @@ import org.w3c.dom.Text;
public class WordToFoConverter extends AbstractWordConverter public class WordToFoConverter extends AbstractWordConverter
{ {
/**
 * Immutable holder for the property values applied to the current
 * <tt>fo:block</tt> element. Those properties shall not be repeated in
 * child <tt>fo:inline</tt> elements.
 */
private static class BlockProperies
{
    final String pFontName;
    final int pFontSize;
    final boolean pBold;
    final boolean pItalic;

    public BlockProperies( String fontName, int fontSize, boolean bold,
            boolean italic )
    {
        pFontName = fontName;
        pFontSize = fontSize;
        pBold = bold;
        pItalic = italic;
    }
}
private static final POILogger logger = POILogFactory private static final POILogger logger = POILogFactory
.getLogger( WordToFoConverter.class ); .getLogger( WordToFoConverter.class );
@ -237,6 +217,24 @@ public class WordToFoConverter extends AbstractWordConverter
inline.appendChild( textNode ); inline.appendChild( textNode );
} }
/**
 * Creates one inline element per bookmark, each nested inside the previous
 * one and carrying the bookmark name as its <tt>id</tt> attribute, then
 * processes the range characters into the innermost element.
 */
@Override
protected void processBookmarks( HWPFDocumentCore wordDocument,
        Element currentBlock, Range range, int currentTableLevel,
        List<Bookmark> rangeBookmarks )
{
    Element innermost = currentBlock;
    for ( final Bookmark rangeBookmark : rangeBookmarks )
    {
        final Element inline = foDocumentFacade.createInline();
        inline.setAttribute( "id", rangeBookmark.getName() );
        innermost.appendChild( inline );
        // next bookmark (if any) is nested inside this one
        innermost = inline;
    }

    if ( range == null )
    {
        return;
    }
    processCharacters( wordDocument, currentTableLevel, range, innermost );
}
@Override @Override
protected void processDocumentInformation( protected void processDocumentInformation(
SummaryInformation summaryInformation ) SummaryInformation summaryInformation )
@ -509,4 +507,26 @@ public class WordToFoConverter extends AbstractWordConverter
} }
} }
/**
 * Immutable holder for the property values applied to the current
 * <tt>fo:block</tt> element. Those properties shall not be repeated in
 * child <tt>fo:inline</tt> elements.
 */
private static class BlockProperies
{
    final String pFontName;
    final int pFontSize;
    final boolean pBold;
    final boolean pItalic;

    public BlockProperies( String fontName, int fontSize, boolean bold,
            boolean italic )
    {
        pFontName = fontName;
        pFontSize = fontSize;
        pBold = bold;
        pItalic = italic;
    }
}
} }

View File

@ -18,6 +18,7 @@ package org.apache.poi.hwpf.converter;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.util.List;
import java.util.Stack; import java.util.Stack;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
@ -31,6 +32,7 @@ import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.FontReplacer.Triplet; import org.apache.poi.hwpf.converter.FontReplacer.Triplet;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Picture;
@ -234,6 +236,24 @@ public class WordToHtmlConverter extends AbstractWordConverter
basicLink ); basicLink );
} }
/**
 * Creates one bookmark element per bookmark via the HTML document facade,
 * each nested inside the previous one, then processes the range characters
 * into the innermost element.
 */
@Override
protected void processBookmarks( HWPFDocumentCore wordDocument,
        Element currentBlock, Range range, int currentTableLevel,
        List<Bookmark> rangeBookmarks )
{
    Element innermost = currentBlock;
    for ( final Bookmark rangeBookmark : rangeBookmarks )
    {
        final String bookmarkName = rangeBookmark.getName();
        final Element anchor = htmlDocumentFacade
                .createBookmark( bookmarkName );
        innermost.appendChild( anchor );
        // next bookmark (if any) is nested inside this one
        innermost = anchor;
    }

    if ( range == null )
    {
        return;
    }
    processCharacters( wordDocument, currentTableLevel, range, innermost );
}
/** /**
* This method shall store image bytes in external file and convert it if * This method shall store image bytes in external file and convert it if
* necessary. Images shall be stored using PNG format. Other formats may be * necessary. Images shall be stored using PNG format. Other formats may be

View File

@ -1,12 +1,33 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.usermodel; package org.apache.poi.hwpf.usermodel;
/**
 * User friendly interface to access information about document bookmarks
 *
 * @author Sergey Vladimirov (vlsergey {at} gmail {dot} com)
 */
public interface Bookmark
{
    /**
     * @return offset at which the bookmark ends (presumably exclusive -
     *         confirm against the implementation)
     */
    int getEnd();

    /**
     * @return name of the bookmark
     */
    String getName();

    /**
     * @return offset at which the bookmark starts
     */
    int getStart();

    /**
     * Changes the name of the bookmark.
     *
     * @param name
     *            new bookmark name
     */
    void setName( String name );
}

View File

@ -50,50 +50,7 @@ public class BookmarksImpl implements Bookmarks
private Bookmark getBookmark( final GenericPropertyNode first ) private Bookmark getBookmark( final GenericPropertyNode first )
{ {
return new Bookmark() return new BookmarkImpl( first );
{
public int getEnd()
{
int currentIndex = bookmarksTables
.getDescriptorFirstIndex( first );
try
{
GenericPropertyNode descriptorLim = bookmarksTables
.getDescriptorLim( currentIndex );
return descriptorLim.getStart();
}
catch ( IndexOutOfBoundsException exc )
{
return first.getEnd();
}
}
public String getName()
{
int currentIndex = bookmarksTables
.getDescriptorFirstIndex( first );
try
{
return bookmarksTables.getName( currentIndex );
}
catch ( ArrayIndexOutOfBoundsException exc )
{
return "";
}
}
public int getStart()
{
return first.getStart();
}
public void setName( String name )
{
int currentIndex = bookmarksTables
.getDescriptorFirstIndex( first );
bookmarksTables.setName( currentIndex, name );
}
};
} }
public Bookmark getBookmark( int index ) public Bookmark getBookmark( int index )
@ -143,6 +100,11 @@ public class BookmarksImpl implements Bookmarks
for ( int lookupIndex = startLookupIndex; lookupIndex < endLookupIndex; lookupIndex++ ) for ( int lookupIndex = startLookupIndex; lookupIndex < endLookupIndex; lookupIndex++ )
{ {
int s = sortedStartPositions[lookupIndex]; int s = sortedStartPositions[lookupIndex];
if ( s < startInclusive )
continue;
if ( s >= endExclusive )
break;
List<Bookmark> startedAt = getBookmarksAt( s ); List<Bookmark> startedAt = getBookmarksAt( s );
if ( startedAt != null ) if ( startedAt != null )
result.put( Integer.valueOf( s ), startedAt ); result.put( Integer.valueOf( s ), startedAt );
@ -186,4 +148,87 @@ public class BookmarksImpl implements Bookmarks
this.sortedDescriptors = result; this.sortedDescriptors = result;
this.sortedStartPositions = indices; this.sortedStartPositions = indices;
} }
/**
 * {@link Bookmark} view over a bookmark start descriptor. Name and end
 * position are looked up in the enclosing instance's
 * <tt>bookmarksTables</tt> on every call. Equality and hash code are based
 * on the start descriptor only.
 */
private final class BookmarkImpl implements Bookmark
{
    private final GenericPropertyNode first;

    private BookmarkImpl( GenericPropertyNode first )
    {
        this.first = first;
    }

    public int getStart()
    {
        return first.getStart();
    }

    public int getEnd()
    {
        final int index = bookmarksTables.getDescriptorFirstIndex( first );
        try
        {
            return bookmarksTables.getDescriptorLim( index ).getStart();
        }
        catch ( IndexOutOfBoundsException exc )
        {
            // no matching end descriptor - fall back to the start
            // descriptor's own end
            return first.getEnd();
        }
    }

    public String getName()
    {
        final int index = bookmarksTables.getDescriptorFirstIndex( first );
        try
        {
            return bookmarksTables.getName( index );
        }
        catch ( ArrayIndexOutOfBoundsException exc )
        {
            // no name stored for this index
            return "";
        }
    }

    public void setName( String name )
    {
        final int index = bookmarksTables.getDescriptorFirstIndex( first );
        bookmarksTables.setName( index, name );
    }

    @Override
    public boolean equals( Object obj )
    {
        if ( this == obj )
            return true;
        if ( obj == null || getClass() != obj.getClass() )
            return false;

        final GenericPropertyNode otherFirst = ( (BookmarkImpl) obj ).first;
        return first == null ? otherFirst == null : first
                .equals( otherFirst );
    }

    @Override
    public int hashCode()
    {
        int result = 1;
        result = 31 * result + ( first == null ? 0 : first.hashCode() );
        return result;
    }

    @Override
    public String toString()
    {
        return "Bookmark [" + getStart() + "; " + getEnd() + "): name: "
                + getName();
    }
}
} }

View File

@ -187,6 +187,7 @@ public class TestWordToHtmlConverter extends TestCase
String result = getHtmlText( "pageref.doc" ); String result = getHtmlText( "pageref.doc" );
assertContains( result, "<a href=\"#userref\">" ); assertContains( result, "<a href=\"#userref\">" );
assertContains( result, "<a name=\"userref\">" );
assertContains( result, "1" ); assertContains( result, "1" );
} }
} }