add Word-to-Text converter

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155281 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Sergey Vladimirov 2011-08-09 09:25:59 +00:00
parent fa26e746cc
commit 888f51c566
6 changed files with 584 additions and 47 deletions

View File

@ -71,9 +71,9 @@ public abstract class AbstractWordConverter
private static final byte SPECCHAR_DRAWN_OBJECT = 8; private static final byte SPECCHAR_DRAWN_OBJECT = 8;
private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011'; protected static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b'; protected static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
private static void addToStructures( List<Structure> structures, private static void addToStructures( List<Structure> structures,
Structure structure ) Structure structure )
@ -205,7 +205,7 @@ public abstract class AbstractWordConverter
Element currentBlock, Range range, int currentTableLevel, Element currentBlock, Range range, int currentTableLevel,
List<Bookmark> rangeBookmarks ); List<Bookmark> rangeBookmarks );
protected boolean processCharacters( final HWPFDocumentCore document, protected boolean processCharacters( final HWPFDocumentCore wordDocument,
final int currentTableLevel, final Range range, final Element block ) final int currentTableLevel, final Range range, final Element block )
{ {
if ( range == null ) if ( range == null )
@ -220,9 +220,9 @@ public abstract class AbstractWordConverter
* reconstruct the structure of range -- sergey * reconstruct the structure of range -- sergey
*/ */
List<Structure> structures = new LinkedList<Structure>(); List<Structure> structures = new LinkedList<Structure>();
if ( document instanceof HWPFDocument ) if ( wordDocument instanceof HWPFDocument )
{ {
final HWPFDocument doc = (HWPFDocument) document; final HWPFDocument doc = (HWPFDocument) wordDocument;
Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks() Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks()
.getBookmarksStartedBetween( range.getStartOffset(), .getBookmarksStartedBetween( range.getStartOffset(),
@ -247,7 +247,7 @@ public abstract class AbstractWordConverter
CharacterRun characterRun = range.getCharacterRun( c ); CharacterRun characterRun = range.getCharacterRun( c );
if ( characterRun == null ) if ( characterRun == null )
throw new AssertionError(); throw new AssertionError();
Field aliveField = ( (HWPFDocument) document ).getFields() Field aliveField = ( (HWPFDocument) wordDocument ).getFields()
.getFieldByStartOffset( FieldsDocumentPart.MAIN, .getFieldByStartOffset( FieldsDocumentPart.MAIN,
characterRun.getStartOffset() ); characterRun.getStartOffset() );
if ( aliveField != null ) if ( aliveField != null )
@ -273,14 +273,15 @@ public abstract class AbstractWordConverter
return "BetweenStructuresSubrange " + super.toString(); return "BetweenStructuresSubrange " + super.toString();
} }
}; };
processCharacters( document, currentTableLevel, subrange, block ); processCharacters( wordDocument, currentTableLevel, subrange,
block );
} }
if ( structure.structure instanceof Bookmark ) if ( structure.structure instanceof Bookmark )
{ {
// other bookmarks with same boundaries // other bookmarks with same boundaries
List<Bookmark> bookmarks = new LinkedList<Bookmark>(); List<Bookmark> bookmarks = new LinkedList<Bookmark>();
for ( Bookmark bookmark : ( (HWPFDocument) document ) for ( Bookmark bookmark : ( (HWPFDocument) wordDocument )
.getBookmarks() .getBookmarks()
.getBookmarksStartedBetween( structure.start, .getBookmarksStartedBetween( structure.start,
structure.start + 1 ).values().iterator() structure.start + 1 ).values().iterator()
@ -306,7 +307,7 @@ public abstract class AbstractWordConverter
} }
}; };
processBookmarks( document, block, subrange, processBookmarks( wordDocument, block, subrange,
currentTableLevel, bookmarks ); currentTableLevel, bookmarks );
} }
finally finally
@ -317,7 +318,7 @@ public abstract class AbstractWordConverter
else if ( structure.structure instanceof Field ) else if ( structure.structure instanceof Field )
{ {
Field field = (Field) structure.structure; Field field = (Field) structure.structure;
processField( (HWPFDocument) document, range, processField( (HWPFDocument) wordDocument, range,
currentTableLevel, field, block ); currentTableLevel, field, block );
} }
else else
@ -349,7 +350,8 @@ public abstract class AbstractWordConverter
return "AfterStructureSubrange " + super.toString(); return "AfterStructureSubrange " + super.toString();
} }
}; };
processCharacters( document, currentTableLevel, subrange, block ); processCharacters( wordDocument, currentTableLevel, subrange,
block );
} }
return true; return true;
} }
@ -361,11 +363,11 @@ public abstract class AbstractWordConverter
if ( characterRun == null ) if ( characterRun == null )
throw new AssertionError(); throw new AssertionError();
if ( document instanceof HWPFDocument if ( wordDocument instanceof HWPFDocument
&& ( (HWPFDocument) document ).getPicturesTable() && ( (HWPFDocument) wordDocument ).getPicturesTable()
.hasPicture( characterRun ) ) .hasPicture( characterRun ) )
{ {
HWPFDocument newFormat = (HWPFDocument) document; HWPFDocument newFormat = (HWPFDocument) wordDocument;
Picture picture = newFormat.getPicturesTable().extractPicture( Picture picture = newFormat.getPicturesTable().extractPicture(
characterRun, true ); characterRun, true );
@ -381,16 +383,16 @@ public abstract class AbstractWordConverter
if ( characterRun.isSpecialCharacter() ) if ( characterRun.isSpecialCharacter() )
{ {
if ( text.charAt( 0 ) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE if ( text.charAt( 0 ) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE
&& ( document instanceof HWPFDocument ) ) && ( wordDocument instanceof HWPFDocument ) )
{ {
HWPFDocument doc = (HWPFDocument) document; HWPFDocument doc = (HWPFDocument) wordDocument;
processNoteAnchor( doc, characterRun, block ); processNoteAnchor( doc, characterRun, block );
continue; continue;
} }
if ( text.charAt( 0 ) == SPECCHAR_DRAWN_OBJECT if ( text.charAt( 0 ) == SPECCHAR_DRAWN_OBJECT
&& ( document instanceof HWPFDocument ) ) && ( wordDocument instanceof HWPFDocument ) )
{ {
HWPFDocument doc = (HWPFDocument) document; HWPFDocument doc = (HWPFDocument) wordDocument;
processDrawnObject( doc, characterRun, block ); processDrawnObject( doc, characterRun, block );
continue; continue;
} }
@ -398,14 +400,15 @@ public abstract class AbstractWordConverter
if ( text.getBytes()[0] == FIELD_BEGIN_MARK ) if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
{ {
if ( document instanceof HWPFDocument ) if ( wordDocument instanceof HWPFDocument )
{ {
Field aliveField = ( (HWPFDocument) document ).getFields() Field aliveField = ( (HWPFDocument) wordDocument )
.getFieldByStartOffset( FieldsDocumentPart.MAIN, .getFields().getFieldByStartOffset(
FieldsDocumentPart.MAIN,
characterRun.getStartOffset() ); characterRun.getStartOffset() );
if ( aliveField != null ) if ( aliveField != null )
{ {
processField( ( (HWPFDocument) document ), range, processField( ( (HWPFDocument) wordDocument ), range,
currentTableLevel, aliveField, block ); currentTableLevel, aliveField, block );
int continueAfter = aliveField.getFieldEndOffset(); int continueAfter = aliveField.getFieldEndOffset();
@ -420,8 +423,8 @@ public abstract class AbstractWordConverter
} }
} }
int skipTo = tryDeadField( document, range, currentTableLevel, int skipTo = tryDeadField( wordDocument, range,
c, block ); currentTableLevel, c, block );
if ( skipTo != c ) if ( skipTo != c )
{ {
@ -610,7 +613,7 @@ public abstract class AbstractWordConverter
CharacterRun characterRun, OfficeDrawing officeDrawing, CharacterRun characterRun, OfficeDrawing officeDrawing,
String path, Element block ); String path, Element block );
protected abstract void processEndnoteAutonumbered( HWPFDocument doc, protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
int noteIndex, Element block, Range endnoteTextRange ); int noteIndex, Element block, Range endnoteTextRange );
protected void processField( HWPFDocument hwpfDocument, Range parentRange, protected void processField( HWPFDocument hwpfDocument, Range parentRange,
@ -666,7 +669,7 @@ public abstract class AbstractWordConverter
field.secondSubrange( parentRange ), currentBlock ); field.secondSubrange( parentRange ), currentBlock );
} }
protected abstract void processFootnoteAutonumbered( HWPFDocument doc, protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
int noteIndex, Element block, Range footnoteTextRange ); int noteIndex, Element block, Range footnoteTextRange );
protected abstract void processHyperlink( HWPFDocumentCore wordDocument, protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
@ -734,8 +737,8 @@ public abstract class AbstractWordConverter
String pageref ); String pageref );
protected abstract void processParagraph( HWPFDocumentCore wordDocument, protected abstract void processParagraph( HWPFDocumentCore wordDocument,
Element parentFopElement, int currentTableLevel, Element parentElement, int currentTableLevel, Paragraph paragraph,
Paragraph paragraph, String bulletText ); String bulletText );
protected void processParagraphes( HWPFDocumentCore wordDocument, protected void processParagraphes( HWPFDocumentCore wordDocument,
Element flow, Range range, int currentTableLevel ) Element flow, Range range, int currentTableLevel )

View File

@ -0,0 +1,179 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.converter;
import org.apache.poi.util.Beta;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
/**
 * Facade over a DOM {@link Document} that is used as storage for text
 * pieces. The constructor builds an HTML-like skeleton
 * (<tt>html</tt> / <tt>head</tt> / <tt>body</tt>) and the factory methods
 * create detached elements with HTML-like names; callers are responsible for
 * attaching the returned nodes to the tree.
 */
@Beta
public class TextDocumentFacade
{
    protected final Element body;
    protected final Document document;
    protected final Element head;
    protected final Element root;

    /** Current <tt>title</tt> element, or <tt>null</tt> when it was removed */
    protected Element title;

    /** Text node holding the title; <tt>null</tt> whenever {@link #title} is */
    protected Text titleText;

    /**
     * Creates the <tt>html</tt> root with <tt>head</tt> and <tt>body</tt>
     * children inside the specified document, plus an initially empty
     * <tt>title</tt> in the head.
     * 
     * @param document
     *            DOM document that receives the new root element
     */
    public TextDocumentFacade( Document document )
    {
        this.document = document;

        root = document.createElement( "html" );
        document.appendChild( root );

        body = document.createElement( "body" );
        head = document.createElement( "head" );

        root.appendChild( head );
        root.appendChild( body );

        title = document.createElement( "title" );
        titleText = document.createTextNode( "" );
        // the text node has to be a child of the title element, otherwise
        // later setTitle() calls would only change a detached node
        title.appendChild( titleText );
        head.appendChild( title );
    }

    /** Adds an "Author" meta entry to the head, see {@link #addMeta}. */
    public void addAuthor( String value )
    {
        addMeta( "Author", value );
    }

    /** Adds a "Description" meta entry to the head, see {@link #addMeta}. */
    public void addDescription( String value )
    {
        addMeta( "Description", value );
    }

    /** Adds a "Keywords" meta entry to the head, see {@link #addMeta}. */
    public void addKeywords( String value )
    {
        addMeta( "Keywords", value );
    }

    /**
     * Appends a <tt>meta</tt> element to the head. It contains a
     * <tt>name</tt> child with the text <tt>name + ": "</tt> and a
     * <tt>value</tt> child with the text <tt>value + "\n"</tt>.
     * 
     * @param name
     *            label of the entry
     * @param value
     *            content of the entry
     */
    public void addMeta( final String name, String value )
    {
        Element meta = document.createElement( "meta" );

        Element metaName = document.createElement( "name" );
        metaName.appendChild( document.createTextNode( name + ": " ) );
        meta.appendChild( metaName );

        Element metaValue = document.createElement( "value" );
        metaValue.appendChild( document.createTextNode( value + "\n" ) );
        meta.appendChild( metaValue );

        head.appendChild( meta );
    }

    /** @return new detached <tt>div</tt> element */
    public Element createBlock()
    {
        return document.createElement( "div" );
    }

    /** @return new detached <tt>h1</tt> element holding a single space */
    public Element createHeader1()
    {
        Element result = document.createElement( "h1" );
        result.appendChild( document.createTextNode( "        " ) );
        return result;
    }

    /** @return new detached <tt>h2</tt> element holding a single space */
    public Element createHeader2()
    {
        Element result = document.createElement( "h2" );
        result.appendChild( document.createTextNode( "    " ) );
        return result;
    }

    /** @return new detached <tt>p</tt> element */
    public Element createParagraph()
    {
        return document.createElement( "p" );
    }

    /** @return new detached <tt>table</tt> element */
    public Element createTable()
    {
        return document.createElement( "table" );
    }

    /** @return new detached <tt>tbody</tt> element */
    public Element createTableBody()
    {
        return document.createElement( "tbody" );
    }

    /** @return new detached <tt>td</tt> element */
    public Element createTableCell()
    {
        return document.createElement( "td" );
    }

    /** @return new detached <tt>tr</tt> element */
    public Element createTableRow()
    {
        return document.createElement( "tr" );
    }

    /**
     * @param data
     *            character data of the node
     * @return new detached text node
     */
    public Text createText( String data )
    {
        return document.createTextNode( data );
    }

    /** @return new detached <tt>ul</tt> element */
    public Element createUnorderedList()
    {
        return document.createElement( "ul" );
    }

    public Element getBody()
    {
        return body;
    }

    public Document getDocument()
    {
        return document;
    }

    public Element getHead()
    {
        return head;
    }

    /**
     * @return current title text, or <tt>null</tt> if the title element has
     *         been removed by {@link #setTitle(String)}
     */
    public String getTitle()
    {
        if ( title == null )
            return null;

        return titleText.getTextContent();
    }

    /**
     * Sets the document title. A value reported as empty by
     * {@link WordToHtmlUtils#isEmpty} removes the <tt>title</tt> element from
     * the head; any other value is stored in the title, re-creating the
     * element first if it had been removed.
     * 
     * @param titleText
     *            new title
     */
    public void setTitle( String titleText )
    {
        if ( WordToHtmlUtils.isEmpty( titleText ) )
        {
            if ( this.title != null )
            {
                this.head.removeChild( this.title );
                this.title = null;
                this.titleText = null;
            }
            // nothing to store -- do not re-create the element just removed
            return;
        }

        if ( this.title == null )
        {
            this.title = document.createElement( "title" );
            this.titleText = document.createTextNode( titleText );
            this.title.appendChild( this.titleText );
            this.head.appendChild( title );
            return;
        }

        this.titleText.setData( titleText );
    }
}

View File

@ -276,7 +276,7 @@ public class WordToFoConverter extends AbstractWordConverter
} }
@Override @Override
protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex, protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
Element block, Range endnoteTextRange ) Element block, Range endnoteTextRange )
{ {
final String textIndex = String.valueOf( internalLinkCounter final String textIndex = String.valueOf( internalLinkCounter
@ -297,14 +297,14 @@ public class WordToFoConverter extends AbstractWordConverter
setId( backwardLink, forwardLinkName ); setId( backwardLink, forwardLinkName );
endnote.appendChild( backwardLink ); endnote.appendChild( backwardLink );
processCharacters( doc, Integer.MIN_VALUE, endnoteTextRange, endnote ); processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
WordToFoUtils.compactInlines( endnote ); WordToFoUtils.compactInlines( endnote );
this.endnotes.add( endnote ); this.endnotes.add( endnote );
} }
@Override @Override
protected void processFootnoteAutonumbered( HWPFDocument doc, protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
int noteIndex, Element block, Range footnoteTextRange ) int noteIndex, Element block, Range footnoteTextRange )
{ {
final String textIndex = String.valueOf( internalLinkCounter final String textIndex = String.valueOf( internalLinkCounter
@ -333,7 +333,7 @@ public class WordToFoConverter extends AbstractWordConverter
footnoteBody.appendChild( footnoteBlock ); footnoteBody.appendChild( footnoteBlock );
footNote.appendChild( footnoteBody ); footNote.appendChild( footnoteBody );
processCharacters( doc, Integer.MIN_VALUE, footnoteTextRange, processCharacters( wordDocument, Integer.MIN_VALUE, footnoteTextRange,
footnoteBlock ); footnoteBlock );
WordToFoUtils.compactInlines( footnoteBlock ); WordToFoUtils.compactInlines( footnoteBlock );

View File

@ -282,17 +282,17 @@ public class WordToHtmlConverter extends AbstractWordConverter
} }
@Override @Override
protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex, protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
Element block, Range endnoteTextRange ) Element block, Range endnoteTextRange )
{ {
processNoteAutonumbered( doc, "end", noteIndex, block, endnoteTextRange ); processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
} }
@Override @Override
protected void processFootnoteAutonumbered( HWPFDocument doc, protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
int noteIndex, Element block, Range footnoteTextRange ) int noteIndex, Element block, Range footnoteTextRange )
{ {
processNoteAutonumbered( doc, "foot", noteIndex, block, processNoteAutonumbered( wordDocument, "foot", noteIndex, block,
footnoteTextRange ); footnoteTextRange );
} }
@ -508,11 +508,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
} }
protected void processParagraph( HWPFDocumentCore hwpfDocument, protected void processParagraph( HWPFDocumentCore hwpfDocument,
Element parentFopElement, int currentTableLevel, Element parentElement, int currentTableLevel, Paragraph paragraph,
Paragraph paragraph, String bulletText ) String bulletText )
{ {
final Element pElement = htmlDocumentFacade.createParagraph(); final Element pElement = htmlDocumentFacade.createParagraph();
parentFopElement.appendChild( pElement ); parentElement.appendChild( pElement );
StringBuilder style = new StringBuilder(); StringBuilder style = new StringBuilder();
WordToHtmlUtils.addParagraphProperties( paragraph, style ); WordToHtmlUtils.addParagraphProperties( paragraph, style );

View File

@ -0,0 +1,288 @@
package org.apache.poi.hwpf.converter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.OutputStream;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.OfficeDrawing;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.util.Beta;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
 * Converts a Word document into a DOM tree built through
 * {@link TextDocumentFacade}. Serializing that tree with the <tt>text</tt>
 * output method (as {@link #main(String[])} does) yields the plain text
 * rendering of the document.
 */
@Beta
public class WordToTextConverter extends AbstractWordConverter
{

    /**
     * Java main() interface to interact with {@link WordToTextConverter}
     * 
     * <p>
     * Usage: WordToTextConverter infile outfile
     * </p>
     * Where infile is an input .doc file ( Word 95-2007) which will be rendered
     * as plain text into outfile
     */
    public static void main( String[] args )
    {
        if ( args.length < 2 )
        {
            System.err
                    .println( "Usage: WordToTextConverter <inputFile.doc> <saveTo.txt>" );
            return;
        }

        System.out.println( "Converting " + args[0] );
        System.out.println( "Saving output to " + args[1] );
        try
        {
            Document doc = WordToTextConverter.process( new File( args[0] ) );

            DOMSource domSource = new DOMSource( doc );
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer serializer = tf.newTransformer();
            // TODO set encoding from a command argument
            serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
            serializer.setOutputProperty( OutputKeys.INDENT, "no" );
            serializer.setOutputProperty( OutputKeys.METHOD, "text" );

            // a byte stream (not a Writer) lets the serializer apply the
            // requested encoding instead of the platform default charset
            OutputStream out = new FileOutputStream( args[1] );
            try
            {
                serializer.transform( domSource, new StreamResult( out ) );
            }
            finally
            {
                // release the file even if the transformation fails
                out.close();
            }
        }
        catch ( Exception e )
        {
            e.printStackTrace();
        }
    }

    /**
     * Loads the specified file and converts it with a fresh
     * {@link WordToTextConverter} backed by a new empty DOM document.
     * 
     * @param docFile
     *            Word file to convert
     * @return DOM document holding the converted text
     */
    static Document process( File docFile ) throws Exception
    {
        final HWPFDocumentCore wordDocument = AbstractWordUtils
                .loadDoc( docFile );
        WordToTextConverter wordToTextConverter = new WordToTextConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument() );
        wordToTextConverter.processDocument( wordDocument );
        return wordToTextConverter.getDocument();
    }

    /** Source of note numbers, shared by footnotes and endnotes */
    private final AtomicInteger noteCounters = new AtomicInteger( 1 );

    /** Block collecting note texts; created when the first note is met */
    private Element notes = null;

    private final TextDocumentFacade textDocumentFacade;

    /**
     * Creates new instance of {@link WordToTextConverter}. Can be used for
     * output several {@link HWPFDocument}s into single text document.
     * 
     * @param document
     *            XML DOM Document used as storage for text pieces
     */
    public WordToTextConverter( Document document )
    {
        this.textDocumentFacade = new TextDocumentFacade( document );
    }

    /** @return DOM document used as storage for the converted text */
    public Document getDocument()
    {
        return textDocumentFacade.getDocument();
    }

    /** Appends the text as a new text node of the block. */
    @Override
    protected void outputCharacters( Element block, CharacterRun characterRun,
            String text )
    {
        block.appendChild( textDocumentFacade.createText( text ) );
    }

    /** Bookmarks have no text representation -- only their content is output. */
    @Override
    protected void processBookmarks( HWPFDocumentCore wordDocument,
            Element currentBlock, Range range, int currentTableLevel,
            List<Bookmark> rangeBookmarks )
    {
        processCharacters( wordDocument, currentTableLevel, range, currentBlock );
    }

    /**
     * Converts the document and then appends the collected notes block (if
     * any note was met) to the end of the body.
     */
    @Override
    public void processDocument( HWPFDocumentCore wordDocument )
    {
        super.processDocument( wordDocument );

        if ( notes != null )
            textDocumentFacade.getBody().appendChild( notes );
    }

    /**
     * Copies non-empty title, author, comments and keywords of the summary
     * information into the head of the text document.
     */
    @Override
    protected void processDocumentInformation(
            SummaryInformation summaryInformation )
    {
        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
            textDocumentFacade.setTitle( summaryInformation.getTitle() );

        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
            textDocumentFacade.addAuthor( summaryInformation.getAuthor() );

        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
            textDocumentFacade
                    .addDescription( summaryInformation.getComments() );

        if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
            textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
    }

    @Override
    protected void processDrawnObject( HWPFDocument doc,
            CharacterRun characterRun, OfficeDrawing officeDrawing,
            String path, Element block )
    {
        // ignore
    }

    /** Endnotes are output the same way as footnotes, see processNote. */
    @Override
    protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
            int noteIndex, Element block, Range endnoteTextRange )
    {
        processNote( wordDocument, block, endnoteTextRange );
    }

    /** Footnotes are output the same way as endnotes, see processNote. */
    @Override
    protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
            int noteIndex, Element block, Range footnoteTextRange )
    {
        processNote( wordDocument, block, footnoteTextRange );
    }

    /**
     * Outputs the link text followed by the target in parentheses. Zero-width
     * spaces are put around the target and around each slash in it.
     */
    @Override
    protected void processHyperlink( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
            String hyperlink )
    {
        processCharacters( wordDocument, currentTableLevel, textRange,
                currentBlock );

        currentBlock.appendChild( textDocumentFacade.createText( " ("
                + UNICODECHAR_ZERO_WIDTH_SPACE
                + hyperlink.replaceAll( "\\/", UNICODECHAR_ZERO_WIDTH_SPACE
                        + "\\/" + UNICODECHAR_ZERO_WIDTH_SPACE )
                + UNICODECHAR_ZERO_WIDTH_SPACE + ")" ) );
    }

    @Override
    protected void processImage( Element currentBlock, boolean inlined,
            Picture picture )
    {
        // ignore
    }

    /** Outputs a line break as a newline character. */
    @Override
    protected void processLineBreak( Element block, CharacterRun characterRun )
    {
        block.appendChild( textDocumentFacade.createText( "\n" ) );
    }

    /**
     * Puts a <tt>[N]</tt> marker into the block and adds the note text,
     * prefixed with <tt>^N</tt>, to the notes block. N comes from the
     * converter's own counter, not from the index inside the Word document.
     * 
     * @param wordDocument
     *            document the note belongs to
     * @param block
     *            element that receives the reference marker
     * @param noteTextRange
     *            range with the text of the note
     */
    protected void processNote( HWPFDocument wordDocument, Element block,
            Range noteTextRange )
    {
        final int noteIndex = noteCounters.getAndIncrement();
        block.appendChild( textDocumentFacade
                .createText( UNICODECHAR_ZERO_WIDTH_SPACE + "[" + noteIndex
                        + "]" + UNICODECHAR_ZERO_WIDTH_SPACE ) );

        if ( notes == null )
            notes = textDocumentFacade.createBlock();

        Element note = textDocumentFacade.createBlock();
        notes.appendChild( note );

        note.appendChild( textDocumentFacade.createText( "^" + noteIndex
                + "\t " ) );
        processCharacters( wordDocument, Integer.MIN_VALUE, noteTextRange, note );
        note.appendChild( textDocumentFacade.createText( "\n" ) );
    }

    /** Page references are output as their plain text content. */
    @Override
    protected void processPageref( HWPFDocumentCore wordDocument,
            Element currentBlock, Range textRange, int currentTableLevel,
            String pageref )
    {
        processCharacters( wordDocument, currentTableLevel, textRange,
                currentBlock );
    }

    /**
     * Outputs the paragraph as bullet text (if any), followed by the paragraph
     * characters and a newline.
     */
    @Override
    protected void processParagraph( HWPFDocumentCore wordDocument,
            Element parentElement, int currentTableLevel, Paragraph paragraph,
            String bulletText )
    {
        Element pElement = textDocumentFacade.createParagraph();

        // paragraphs outside of lists have no bullet text to output
        if ( AbstractWordUtils.isNotEmpty( bulletText ) )
            pElement.appendChild( textDocumentFacade.createText( bulletText ) );

        processCharacters( wordDocument, currentTableLevel, paragraph, pElement );
        pElement.appendChild( textDocumentFacade.createText( "\n" ) );
        parentElement.appendChild( pElement );
    }

    /**
     * Outputs the section as a block with its paragraphs followed by a
     * newline, and appends the block to the body.
     */
    @Override
    protected void processSection( HWPFDocumentCore wordDocument,
            Section section, int s )
    {
        Element sectionElement = textDocumentFacade.createBlock();
        processParagraphes( wordDocument, sectionElement, section,
                Integer.MIN_VALUE );
        sectionElement.appendChild( textDocumentFacade.createText( "\n" ) );
        textDocumentFacade.getBody().appendChild( sectionElement );
    }

    /**
     * Outputs the table one row per line; every cell except the first one of
     * a row is preceded by a tab character.
     */
    protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
            Table table )
    {
        final int tableRows = table.numRows();
        for ( int r = 0; r < tableRows; r++ )
        {
            TableRow tableRow = table.getRow( r );

            Element tableRowElement = textDocumentFacade.createTableRow();

            final int rowCells = tableRow.numCells();
            for ( int c = 0; c < rowCells; c++ )
            {
                TableCell tableCell = tableRow.getCell( c );

                Element tableCellElement = textDocumentFacade.createTableCell();

                if ( c != 0 )
                    tableCellElement.appendChild( textDocumentFacade
                            .createText( "\t" ) );

                processParagraphes( hwpfDocument, tableCellElement, tableCell,
                        table.getTableLevel() );
                tableRowElement.appendChild( tableCellElement );
            }

            tableRowElement.appendChild( textDocumentFacade.createText( "\n" ) );
            flow.appendChild( tableRowElement );
        }
    }
}

View File

@ -45,7 +45,8 @@ public class TestWordToConverterSuite
public static Test suite() public static Test suite()
{ {
TestSuite suite = new TestSuite(TestWordToConverterSuite.class.getName()); TestSuite suite = new TestSuite(
TestWordToConverterSuite.class.getName() );
File directory = POIDataSamples.getDocumentInstance().getFile( File directory = POIDataSamples.getDocumentInstance().getFile(
"../document" ); "../document" );
@ -63,14 +64,21 @@ public class TestWordToConverterSuite
{ {
public void runTest() throws Exception public void runTest() throws Exception
{ {
test( child, false ); testFo( child );
} }
} ); } );
suite.addTest( new TestCase( name + " [HTML]" ) suite.addTest( new TestCase( name + " [HTML]" )
{ {
public void runTest() throws Exception public void runTest() throws Exception
{ {
test( child, true ); testHtml( child );
}
} );
suite.addTest( new TestCase( name + " [TEXT]" )
{
public void runTest() throws Exception
{
testText( child );
} }
} ); } );
@ -79,7 +87,7 @@ public class TestWordToConverterSuite
return suite; return suite;
} }
protected static void test( File child, boolean html ) throws Exception protected static void testFo( File child ) throws Exception
{ {
HWPFDocumentCore hwpfDocument; HWPFDocumentCore hwpfDocument;
try try
@ -88,7 +96,6 @@ public class TestWordToConverterSuite
} }
catch ( Exception exc ) catch ( Exception exc )
{ {
// unable to parse file -- not WordToFoConverter fault
return; return;
} }
@ -102,14 +109,74 @@ public class TestWordToConverterSuite
Transformer transformer = TransformerFactory.newInstance() Transformer transformer = TransformerFactory.newInstance()
.newTransformer(); .newTransformer();
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" ); transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
transformer.setOutputProperty( OutputKeys.INDENT, "yes" ); transformer.setOutputProperty( OutputKeys.INDENT, "false" );
transformer.transform( transformer.transform(
new DOMSource( wordToFoConverter.getDocument() ), new DOMSource( wordToFoConverter.getDocument() ),
new StreamResult( stringWriter ) ); new StreamResult( stringWriter ) );
if ( html ) // no exceptions
}
protected static void testHtml( File child ) throws Exception
{
HWPFDocumentCore hwpfDocument;
try
{
hwpfDocument = AbstractWordUtils.loadDoc( child );
}
catch ( Exception exc )
{
return;
}
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
DocumentBuilderFactory.newInstance().newDocumentBuilder()
.newDocument() );
wordToHtmlConverter.processDocument( hwpfDocument );
StringWriter stringWriter = new StringWriter();
Transformer transformer = TransformerFactory.newInstance()
.newTransformer();
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
transformer.setOutputProperty( OutputKeys.INDENT, "false" );
transformer.setOutputProperty( OutputKeys.METHOD, "html" ); transformer.setOutputProperty( OutputKeys.METHOD, "html" );
transformer.transform(
new DOMSource( wordToHtmlConverter.getDocument() ),
new StreamResult( stringWriter ) );
// no exceptions // no exceptions
} }
/**
 * Smoke test for {@link WordToTextConverter}: converts the file and
 * serializes the result with the <tt>text</tt> output method. Nothing is
 * asserted about the produced text -- the test passes if no exception is
 * thrown. Files that cannot be loaded are silently skipped.
 * 
 * @param child
 *            Word file to convert
 */
protected static void testText( File child ) throws Exception
{
    HWPFDocumentCore wordDocument;
    try
    {
        wordDocument = AbstractWordUtils.loadDoc( child );
    }
    catch ( Exception ignored )
    {
        // unable to parse file -- not WordToTextConverter fault
        return;
    }

    WordToTextConverter wordToTextConverter = new WordToTextConverter(
            DocumentBuilderFactory.newInstance().newDocumentBuilder()
                    .newDocument() );
    wordToTextConverter.processDocument( wordDocument );

    StringWriter stringWriter = new StringWriter();

    Transformer transformer = TransformerFactory.newInstance()
            .newTransformer();
    transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
    transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
    transformer.setOutputProperty( OutputKeys.METHOD, "text" );
    transformer.transform(
            new DOMSource( wordToTextConverter.getDocument() ),
            new StreamResult( stringWriter ) );

    // no exceptions
}
} }