simplify work with fields

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1144337 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-07-08 14:33:01 +00:00
parent efdf16ab0a
commit d7217bd59b
4 changed files with 168 additions and 178 deletions

View File

@ -16,13 +16,16 @@
==================================================================== */ ==================================================================== */
package org.apache.poi.hwpf.converter; package org.apache.poi.hwpf.converter;
import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.poi.ss.formula.functions.Match;
import org.apache.poi.hpsf.SummaryInformation; import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.model.Field;
import org.apache.poi.hwpf.model.FieldsTables;
import org.apache.poi.hwpf.model.ListFormatOverride; import org.apache.poi.hwpf.model.ListFormatOverride;
import org.apache.poi.hwpf.model.ListTables; import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.CharacterRun;
@ -55,14 +58,16 @@ public abstract class AbstractWordConverter
CharacterRun characterRun, String text ); CharacterRun characterRun, String text );
protected boolean processCharacters( HWPFDocumentCore hwpfDocument, protected boolean processCharacters( HWPFDocumentCore hwpfDocument,
int currentTableLevel, Paragraph paragraph, final Element block, int currentTableLevel, Range range, final Element block )
List<CharacterRun> characterRuns, final int start, final int end )
{ {
if (range == null)
return false;
boolean haveAnyText = false; boolean haveAnyText = false;
for ( int c = start; c < end; c++ ) for ( int c = 0; c < range.numCharacterRuns(); c++ )
{ {
CharacterRun characterRun = characterRuns.get( c ); CharacterRun characterRun = range.getCharacterRun( c );
if ( characterRun == null ) if ( characterRun == null )
throw new AssertionError(); throw new AssertionError();
@ -86,8 +91,23 @@ public abstract class AbstractWordConverter
if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
{ {
int skipTo = tryField( hwpfDocument, paragraph, if ( hwpfDocument instanceof HWPFDocument )
currentTableLevel, characterRuns, c, block ); {
Field aliveField = ( (HWPFDocument) hwpfDocument )
.getFieldsTables().lookupFieldByStartOffset(
FieldsTables.PLCFFLDMOM,
characterRun.getStartOffset() );
if ( aliveField != null )
{
processField( ( (HWPFDocument) hwpfDocument ), range,
currentTableLevel, aliveField, block );
c = aliveField.getEndOffset();
continue;
}
}
int skipTo = tryDeadField( hwpfDocument, range,
currentTableLevel, c, block );
if ( skipTo != c ) if ( skipTo != c )
{ {
@ -145,91 +165,48 @@ public abstract class AbstractWordConverter
protected abstract void processDocumentInformation( protected abstract void processDocumentInformation(
SummaryInformation summaryInformation ); SummaryInformation summaryInformation );
protected void processField( HWPFDocumentCore wordDocument, protected void processDeadField( HWPFDocumentCore wordDocument,
Element currentBlock, Paragraph paragraph, int currentTableLevel, Element currentBlock, Range range, int currentTableLevel,
List<CharacterRun> characterRuns, int beginMark, int separatorMark, int beginMark, int separatorMark, int endMark )
int endMark )
{ {
Pattern hyperlinkPattern = Pattern
.compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );
Pattern pagerefPattern = Pattern
.compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );
if ( separatorMark - beginMark > 1 )
{
int index = beginMark + 1;
CharacterRun firstAfterBegin = null;
while ( index < separatorMark )
{
firstAfterBegin = paragraph.getCharacterRun( index );
if ( firstAfterBegin == null )
{
logger.log( POILogger.WARN,
"Paragraph " + paragraph.getStartOffset() + "--"
+ paragraph.getEndOffset()
+ " contains null CharacterRun #" + index );
index++;
continue;
}
break;
}
if ( firstAfterBegin != null )
{
final Matcher hyperlinkMatcher = hyperlinkPattern
.matcher( firstAfterBegin.text() );
if ( hyperlinkMatcher.matches() )
{
String hyperlink = hyperlinkMatcher.group( 1 );
processHyperlink( wordDocument, currentBlock, paragraph,
characterRuns, currentTableLevel, hyperlink,
separatorMark + 1, endMark );
return;
}
final Matcher pagerefMatcher = pagerefPattern
.matcher( firstAfterBegin.text() );
if ( pagerefMatcher.matches() )
{
String pageref = pagerefMatcher.group( 1 );
processPageref( wordDocument, currentBlock, paragraph,
characterRuns, currentTableLevel, pageref,
separatorMark + 1, endMark );
return;
}
}
}
StringBuilder debug = new StringBuilder( "Unsupported field type: \n" ); StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
for ( int i = beginMark; i <= endMark; i++ ) for ( int i = beginMark; i <= endMark; i++ )
{ {
debug.append( "\t" ); debug.append( "\t" );
debug.append( paragraph.getCharacterRun( i ) ); debug.append( range.getCharacterRun( i ) );
debug.append( "\n" ); debug.append( "\n" );
} }
logger.log( POILogger.WARN, debug ); logger.log( POILogger.WARN, debug );
Range deadFieldValueSubrage = new Range( range.getCharacterRun(
separatorMark ).getStartOffset() + 1, range.getCharacterRun(
endMark ).getStartOffset(), range )
{
@Override
public String toString()
{
return "DeadFieldValueSubrange (" + super.toString() + ")";
}
};
// just output field value // just output field value
if ( separatorMark + 1 < endMark ) if ( separatorMark + 1 < endMark )
processCharacters( wordDocument, currentTableLevel, paragraph, processCharacters( wordDocument, currentTableLevel,
currentBlock, characterRuns, separatorMark + 1, endMark ); deadFieldValueSubrage, currentBlock );
return; return;
} }
protected abstract void processHyperlink( HWPFDocumentCore wordDocument, protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
Element currentBlock, Paragraph paragraph, Element currentBlock, Range textRange, int currentTableLevel,
List<CharacterRun> characterRuns, int currentTableLevel, String hyperlink );
String hyperlink, int i, int endMark );
protected abstract void processImage( Element currentBlock, protected abstract void processImage( Element currentBlock,
boolean inlined, Picture picture ); boolean inlined, Picture picture );
protected abstract void processPageref( HWPFDocumentCore wordDocument, protected abstract void processPageref( HWPFDocumentCore wordDocument,
Element currentBlock, Paragraph paragraph, Element currentBlock, Range textRange, int currentTableLevel,
List<CharacterRun> characterRuns, int currentTableLevel, String pageref );
String pageref, int beginTextInclusive, int endTextExclusive );
protected abstract void processParagraph( HWPFDocumentCore wordDocument, protected abstract void processParagraph( HWPFDocumentCore wordDocument,
Element parentFopElement, int currentTableLevel, Element parentFopElement, int currentTableLevel,
@ -317,20 +294,107 @@ public abstract class AbstractWordConverter
protected abstract void processTable( HWPFDocumentCore wordDocument, protected abstract void processTable( HWPFDocumentCore wordDocument,
Element flow, Table table ); Element flow, Table table );
protected int tryField( HWPFDocumentCore wordDocument, Paragraph paragraph, protected Field processField( HWPFDocumentCore wordDocument,
int currentTableLevel, List<CharacterRun> characterRuns, Range charactersRange, int currentTableLevel, int startOffset,
int beginMark, Element currentBlock ) Element currentBlock )
{
if ( !( wordDocument instanceof HWPFDocument ) )
return null;
HWPFDocument hwpfDocument = (HWPFDocument) wordDocument;
Field field = hwpfDocument.getFieldsTables().lookupFieldByStartOffset(
FieldsTables.PLCFFLDMOM, startOffset );
if ( field == null )
return null;
processField( hwpfDocument, charactersRange, currentTableLevel, field,
currentBlock );
return field;
}
/** Field type code that this converter treats as a page reference. */
private static final int FIELD_TYPE_PAGE_REFERENCE = 37;

/** Field type code that this converter treats as a hyperlink. */
private static final int FIELD_TYPE_HYPERLINK = 88;

/**
 * Matches a <tt>PAGEREF name \h</tt> field code; group 1 is the referenced
 * name. Compiled once because the expression never changes between calls.
 */
private static final Pattern PAGEREF_PATTERN = Pattern
        .compile( "[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h[ \\t\\r\\n]*" );

/**
 * Matches a <tt>HYPERLINK "target"</tt> field code; group 1 is the quoted
 * target. Compiled once because the expression never changes between calls.
 */
private static final Pattern HYPERLINK_PATTERN = Pattern
        .compile( "[ \\t\\r\\n]*HYPERLINK \"(.*)\"[ \\t\\r\\n]*" );

/**
 * Converts a single field found inside <tt>parentRange</tt>.
 * <p>
 * Page-reference and hyperlink fields whose first sub-range text matches
 * the expected field code are forwarded to
 * {@link #processPageref} / {@link #processHyperlink} together with the
 * field's second sub-range. Any other field (unknown type, missing first
 * sub-range or unrecognised code) is logged as a warning and only its
 * second sub-range is rendered as plain characters.
 *
 * @param hwpfDocument
 *            document that owns the field
 * @param parentRange
 *            range the field's sub-ranges are resolved against
 * @param currentTableLevel
 *            table nesting level, passed through unchanged
 * @param field
 *            field to convert; must not be <tt>null</tt>
 * @param currentBlock
 *            element that receives the generated output
 */
protected void processField( HWPFDocument hwpfDocument, Range parentRange,
        int currentTableLevel, Field field, Element currentBlock )
{
    switch ( field.getType() )
    {
    case FIELD_TYPE_PAGE_REFERENCE:
    {
        final Range firstSubrange = field.firstSubrange( parentRange );
        if ( firstSubrange != null )
        {
            // presumably the first sub-range holds the field code text
            String formula = firstSubrange.text();
            Matcher matcher = PAGEREF_PATTERN.matcher( formula );
            if ( matcher.find() )
            {
                String pageref = matcher.group( 1 );
                processPageref( hwpfDocument, currentBlock,
                        field.secondSubrange( parentRange ),
                        currentTableLevel, pageref );
                return;
            }
        }
        break;
    }
    case FIELD_TYPE_HYPERLINK:
    {
        final Range firstSubrange = field.firstSubrange( parentRange );
        if ( firstSubrange != null )
        {
            // presumably the first sub-range holds the field code text
            String formula = firstSubrange.text();
            Matcher matcher = HYPERLINK_PATTERN.matcher( formula );
            if ( matcher.find() )
            {
                String hyperlink = matcher.group( 1 );
                processHyperlink( hwpfDocument, currentBlock,
                        field.secondSubrange( parentRange ),
                        currentTableLevel, hyperlink );
                return;
            }
        }
        break;
    }
    }

    // Fallback: unsupported type or unparsable code -- warn and emit only
    // the second sub-range so its text is not lost.
    logger.log( POILogger.WARN, parentRange + " contains " + field
            + " with unsupported type or format" );
    processCharacters( hwpfDocument, currentTableLevel,
            field.secondSubrange( parentRange ), currentBlock );
}
protected int tryDeadField( HWPFDocumentCore wordDocument, Range range,
int currentTableLevel, int beginMark, Element currentBlock )
{ {
int separatorMark = -1; int separatorMark = -1;
int endMark = -1; int endMark = -1;
for ( int c = beginMark + 1; c < paragraph.numCharacterRuns(); c++ ) for ( int c = beginMark + 1; c < range.numCharacterRuns(); c++ )
{ {
CharacterRun characterRun = paragraph.getCharacterRun( c ); CharacterRun characterRun = range.getCharacterRun( c );
String text = characterRun.text(); String text = characterRun.text();
if ( text.getBytes().length == 0 ) if ( text.getBytes().length == 0 )
continue; continue;
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
{
// nested?
Field possibleField = processField( wordDocument, range,
currentTableLevel, characterRun.getStartOffset(),
currentBlock );
if ( possibleField != null )
{
c = possibleField.getEndOffset();
}
else
{
continue;
}
}
if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK ) if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
{ {
if ( separatorMark != -1 ) if ( separatorMark != -1 )
@ -360,8 +424,8 @@ public abstract class AbstractWordConverter
if ( separatorMark == -1 || endMark == -1 ) if ( separatorMark == -1 || endMark == -1 )
return beginMark; return beginMark;
processField( wordDocument, currentBlock, paragraph, currentTableLevel, processDeadField( wordDocument, currentBlock, range, currentTableLevel,
characterRuns, beginMark, separatorMark, endMark ); beginMark, separatorMark, endMark );
return endMark; return endMark;
} }

View File

@ -22,19 +22,14 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
import java.lang.reflect.Field; import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore; import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.HWPFOldDocument; import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException; import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.ListLevel; import org.apache.poi.hwpf.model.ListLevel;
import org.apache.poi.hwpf.model.ListTables; import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.usermodel.BorderCode; import org.apache.poi.hwpf.usermodel.BorderCode;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section; import org.apache.poi.hwpf.usermodel.Section;
@ -55,35 +50,6 @@ public class AbstractWordUtils
return str1 == null ? str2 == null : str1.equals( str2 ); return str1 == null ? str2 == null : str1.equals( str2 );
} }
// XXX incorporate into Range
/**
 * Collects the character runs of <tt>range</tt> by walking the CHPX list
 * obtained from {@link #getCharacters(Range)}.
 * <p>
 * <tt>null</tt> CHPX entries are skipped. A CHPX is used only when the
 * larger of the two start offsets does not exceed the smaller of the two
 * end offsets (touching boundaries count). CHPXs for which no character
 * run can be obtained are skipped as well.
 *
 * @param range
 *            range whose character runs are wanted
 * @return the character runs found, in CHPX order; never <tt>null</tt>
 */
static List<CharacterRun> findCharacterRuns( Range range )
{
    final int rangeStart = range.getStartOffset();
    final int rangeEnd = range.getEndOffset();

    List<CharacterRun> runs = new ArrayList<CharacterRun>();

    for ( CHPX candidate : getCharacters( range ) )
    {
        if ( candidate == null )
            continue;

        final int overlapStart = Math.max( rangeStart, candidate.getStart() );
        final int overlapEnd = Math.min( rangeEnd, candidate.getEnd() );
        if ( overlapStart > overlapEnd )
            continue;

        final CharacterRun run = getCharacterRun( range, candidate );
        if ( run != null )
            runs.add( run );
    }

    return runs;
}
public static String getBorderType( BorderCode borderCode ) public static String getBorderType( BorderCode borderCode )
{ {
if ( borderCode == null ) if ( borderCode == null )
@ -196,35 +162,6 @@ public class AbstractWordUtils
return bulletBuffer.toString(); return bulletBuffer.toString();
} }
/**
 * Reflectively invokes the <tt>getCharacterRun(CHPX)</tt> method declared
 * on {@link Range}, after forcing it accessible.
 *
 * @param range
 *            instance the method is invoked on
 * @param chpx
 *            argument passed to the method
 * @return the method's result cast to {@link CharacterRun}
 * @throws Error
 *             wrapping any exception raised by the lookup, the call or the
 *             cast
 */
private static CharacterRun getCharacterRun( Range range, CHPX chpx )
{
    try
    {
        final Method accessor = Range.class.getDeclaredMethod(
                "getCharacterRun", CHPX.class );
        accessor.setAccessible( true );

        final Object run = accessor.invoke( range, chpx );
        // cast stays inside the try so a bad type is wrapped like any
        // other failure
        return (CharacterRun) run;
    }
    catch ( Exception reflectionFailure )
    {
        throw new Error( reflectionFailure );
    }
}
/**
 * Reflectively reads the <tt>_characters</tt> field declared on
 * {@link Range}, after forcing it accessible.
 *
 * @param range
 *            instance the field is read from
 * @return the field value cast to a list of {@link CHPX}
 * @throws Error
 *             wrapping any exception raised by the lookup, the read or the
 *             cast
 */
private static List<CHPX> getCharacters( Range range )
{
    try
    {
        final Field charactersField = Range.class
                .getDeclaredField( "_characters" );
        charactersField.setAccessible( true );

        final Object value = charactersField.get( range );
        return (List<CHPX>) value;
    }
    catch ( Exception reflectionFailure )
    {
        throw new Error( reflectionFailure );
    }
}
public static String getColor( int ico ) public static String getColor( int ico )
{ {
switch ( ico ) switch ( ico )

View File

@ -18,7 +18,6 @@ package org.apache.poi.hwpf.converter;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.util.List;
import java.util.Stack; import java.util.Stack;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
@ -34,6 +33,7 @@ import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section; import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.SectionProperties; import org.apache.poi.hwpf.usermodel.SectionProperties;
import org.apache.poi.hwpf.usermodel.Table; import org.apache.poi.hwpf.usermodel.Table;
@ -248,19 +248,17 @@ public class WordToFoConverter extends AbstractWordConverter
foDocumentFacade.setDescription( summaryInformation.getComments() ); foDocumentFacade.setDescription( summaryInformation.getComments() );
} }
protected void processHyperlink( HWPFDocumentCore hwpfDocument, protected void processHyperlink( HWPFDocumentCore wordDocument,
Element currentBlock, Paragraph paragraph, Element currentBlock, Range textRange, int currentTableLevel,
List<CharacterRun> characterRuns, int currentTableLevel, String hyperlink )
String hyperlink, int beginTextInclusive, int endTextExclusive )
{ {
Element basicLink = foDocumentFacade Element basicLink = foDocumentFacade
.createBasicLinkExternal( hyperlink ); .createBasicLinkExternal( hyperlink );
currentBlock.appendChild( basicLink ); currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive ) if ( textRange != null )
processCharacters( hwpfDocument, currentTableLevel, paragraph, processCharacters( wordDocument, currentTableLevel, textRange,
basicLink, characterRuns, beginTextInclusive, basicLink );
endTextExclusive );
} }
/** /**
@ -292,17 +290,15 @@ public class WordToFoConverter extends AbstractWordConverter
} }
protected void processPageref( HWPFDocumentCore hwpfDocument, protected void processPageref( HWPFDocumentCore hwpfDocument,
Element currentBlock, Paragraph paragraph, Element currentBlock, Range textRange, int currentTableLevel,
List<CharacterRun> characterRuns, int currentTableLevel, String pageref )
String pageref, int beginTextInclusive, int endTextExclusive )
{ {
Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref ); Element basicLink = foDocumentFacade.createBasicLinkInternal( pageref );
currentBlock.appendChild( basicLink ); currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive ) if ( textRange != null )
processCharacters( hwpfDocument, currentTableLevel, paragraph, processCharacters( hwpfDocument, currentTableLevel, textRange,
basicLink, characterRuns, beginTextInclusive, basicLink );
endTextExclusive );
} }
protected void processParagraph( HWPFDocumentCore hwpfDocument, protected void processParagraph( HWPFDocumentCore hwpfDocument,
@ -356,10 +352,8 @@ public class WordToFoConverter extends AbstractWordConverter
haveAnyText |= bulletText.trim().length() != 0; haveAnyText |= bulletText.trim().length() != 0;
} }
List<CharacterRun> characterRuns = WordToFoUtils
.findCharacterRuns( paragraph );
haveAnyText = processCharacters( hwpfDocument, currentTableLevel, haveAnyText = processCharacters( hwpfDocument, currentTableLevel,
paragraph, block, characterRuns, 0, characterRuns.size() ); paragraph, block );
if ( !haveAnyText ) if ( !haveAnyText )
{ {

View File

@ -18,7 +18,6 @@ package org.apache.poi.hwpf.converter;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.util.List;
import java.util.Stack; import java.util.Stack;
import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.DocumentBuilderFactory;
@ -34,6 +33,7 @@ import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.usermodel.CharacterRun; import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section; import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.SectionProperties; import org.apache.poi.hwpf.usermodel.SectionProperties;
import org.apache.poi.hwpf.usermodel.Table; import org.apache.poi.hwpf.usermodel.Table;
@ -226,18 +226,17 @@ public class WordToHtmlConverter extends AbstractWordConverter
.addDescription( summaryInformation.getComments() ); .addDescription( summaryInformation.getComments() );
} }
@Override
protected void processHyperlink( HWPFDocumentCore wordDocument, protected void processHyperlink( HWPFDocumentCore wordDocument,
Element currentBlock, Paragraph paragraph, Element currentBlock, Range textRange, int currentTableLevel,
List<CharacterRun> characterRuns, int currentTableLevel, String hyperlink )
String hyperlink, int beginTextInclusive, int endTextExclusive )
{ {
Element basicLink = htmlDocumentFacade.createHyperlink( hyperlink ); Element basicLink = htmlDocumentFacade.createHyperlink( hyperlink );
currentBlock.appendChild( basicLink ); currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive ) if ( textRange != null )
processCharacters( wordDocument, currentTableLevel, paragraph, processCharacters( wordDocument, currentTableLevel, textRange,
basicLink, characterRuns, beginTextInclusive, basicLink );
endTextExclusive );
} }
/** /**
@ -266,17 +265,15 @@ public class WordToHtmlConverter extends AbstractWordConverter
} }
protected void processPageref( HWPFDocumentCore hwpfDocument, protected void processPageref( HWPFDocumentCore hwpfDocument,
Element currentBlock, Paragraph paragraph, Element currentBlock, Range textRange, int currentTableLevel,
List<CharacterRun> characterRuns, int currentTableLevel, String pageref )
String pageref, int beginTextInclusive, int endTextExclusive )
{ {
Element basicLink = htmlDocumentFacade.createHyperlink( "#" + pageref ); Element basicLink = htmlDocumentFacade.createHyperlink( "#" + pageref );
currentBlock.appendChild( basicLink ); currentBlock.appendChild( basicLink );
if ( beginTextInclusive < endTextExclusive ) if ( textRange != null )
processCharacters( hwpfDocument, currentTableLevel, paragraph, processCharacters( hwpfDocument, currentTableLevel, textRange,
basicLink, characterRuns, beginTextInclusive, basicLink );
endTextExclusive );
} }
protected void processParagraph( HWPFDocumentCore hwpfDocument, protected void processParagraph( HWPFDocumentCore hwpfDocument,
@ -322,10 +319,8 @@ public class WordToHtmlConverter extends AbstractWordConverter
pElement.appendChild( textNode ); pElement.appendChild( textNode );
} }
List<CharacterRun> characterRuns = WordToHtmlUtils
.findCharacterRuns( paragraph );
processCharacters( hwpfDocument, currentTableLevel, paragraph, processCharacters( hwpfDocument, currentTableLevel, paragraph,
pElement, characterRuns, 0, characterRuns.size() ); pElement );
} }
finally finally
{ {