mirror of https://github.com/apache/poi.git
add Word-to-Text converter
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155281 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fa26e746cc
commit
888f51c566
|
@ -71,9 +71,9 @@ public abstract class AbstractWordConverter
|
|||
|
||||
private static final byte SPECCHAR_DRAWN_OBJECT = 8;
|
||||
|
||||
private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
|
||||
protected static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
|
||||
|
||||
private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
|
||||
protected static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
|
||||
|
||||
private static void addToStructures( List<Structure> structures,
|
||||
Structure structure )
|
||||
|
@ -205,7 +205,7 @@ public abstract class AbstractWordConverter
|
|||
Element currentBlock, Range range, int currentTableLevel,
|
||||
List<Bookmark> rangeBookmarks );
|
||||
|
||||
protected boolean processCharacters( final HWPFDocumentCore document,
|
||||
protected boolean processCharacters( final HWPFDocumentCore wordDocument,
|
||||
final int currentTableLevel, final Range range, final Element block )
|
||||
{
|
||||
if ( range == null )
|
||||
|
@ -220,9 +220,9 @@ public abstract class AbstractWordConverter
|
|||
* reconstruct the structure of range -- sergey
|
||||
*/
|
||||
List<Structure> structures = new LinkedList<Structure>();
|
||||
if ( document instanceof HWPFDocument )
|
||||
if ( wordDocument instanceof HWPFDocument )
|
||||
{
|
||||
final HWPFDocument doc = (HWPFDocument) document;
|
||||
final HWPFDocument doc = (HWPFDocument) wordDocument;
|
||||
|
||||
Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks()
|
||||
.getBookmarksStartedBetween( range.getStartOffset(),
|
||||
|
@ -247,7 +247,7 @@ public abstract class AbstractWordConverter
|
|||
CharacterRun characterRun = range.getCharacterRun( c );
|
||||
if ( characterRun == null )
|
||||
throw new AssertionError();
|
||||
Field aliveField = ( (HWPFDocument) document ).getFields()
|
||||
Field aliveField = ( (HWPFDocument) wordDocument ).getFields()
|
||||
.getFieldByStartOffset( FieldsDocumentPart.MAIN,
|
||||
characterRun.getStartOffset() );
|
||||
if ( aliveField != null )
|
||||
|
@ -273,14 +273,15 @@ public abstract class AbstractWordConverter
|
|||
return "BetweenStructuresSubrange " + super.toString();
|
||||
}
|
||||
};
|
||||
processCharacters( document, currentTableLevel, subrange, block );
|
||||
processCharacters( wordDocument, currentTableLevel, subrange,
|
||||
block );
|
||||
}
|
||||
|
||||
if ( structure.structure instanceof Bookmark )
|
||||
{
|
||||
// other bookmarks with same boundaries
|
||||
List<Bookmark> bookmarks = new LinkedList<Bookmark>();
|
||||
for ( Bookmark bookmark : ( (HWPFDocument) document )
|
||||
for ( Bookmark bookmark : ( (HWPFDocument) wordDocument )
|
||||
.getBookmarks()
|
||||
.getBookmarksStartedBetween( structure.start,
|
||||
structure.start + 1 ).values().iterator()
|
||||
|
@ -306,7 +307,7 @@ public abstract class AbstractWordConverter
|
|||
}
|
||||
};
|
||||
|
||||
processBookmarks( document, block, subrange,
|
||||
processBookmarks( wordDocument, block, subrange,
|
||||
currentTableLevel, bookmarks );
|
||||
}
|
||||
finally
|
||||
|
@ -317,7 +318,7 @@ public abstract class AbstractWordConverter
|
|||
else if ( structure.structure instanceof Field )
|
||||
{
|
||||
Field field = (Field) structure.structure;
|
||||
processField( (HWPFDocument) document, range,
|
||||
processField( (HWPFDocument) wordDocument, range,
|
||||
currentTableLevel, field, block );
|
||||
}
|
||||
else
|
||||
|
@ -349,7 +350,8 @@ public abstract class AbstractWordConverter
|
|||
return "AfterStructureSubrange " + super.toString();
|
||||
}
|
||||
};
|
||||
processCharacters( document, currentTableLevel, subrange, block );
|
||||
processCharacters( wordDocument, currentTableLevel, subrange,
|
||||
block );
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -361,11 +363,11 @@ public abstract class AbstractWordConverter
|
|||
if ( characterRun == null )
|
||||
throw new AssertionError();
|
||||
|
||||
if ( document instanceof HWPFDocument
|
||||
&& ( (HWPFDocument) document ).getPicturesTable()
|
||||
if ( wordDocument instanceof HWPFDocument
|
||||
&& ( (HWPFDocument) wordDocument ).getPicturesTable()
|
||||
.hasPicture( characterRun ) )
|
||||
{
|
||||
HWPFDocument newFormat = (HWPFDocument) document;
|
||||
HWPFDocument newFormat = (HWPFDocument) wordDocument;
|
||||
Picture picture = newFormat.getPicturesTable().extractPicture(
|
||||
characterRun, true );
|
||||
|
||||
|
@ -381,16 +383,16 @@ public abstract class AbstractWordConverter
|
|||
if ( characterRun.isSpecialCharacter() )
|
||||
{
|
||||
if ( text.charAt( 0 ) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE
|
||||
&& ( document instanceof HWPFDocument ) )
|
||||
&& ( wordDocument instanceof HWPFDocument ) )
|
||||
{
|
||||
HWPFDocument doc = (HWPFDocument) document;
|
||||
HWPFDocument doc = (HWPFDocument) wordDocument;
|
||||
processNoteAnchor( doc, characterRun, block );
|
||||
continue;
|
||||
}
|
||||
if ( text.charAt( 0 ) == SPECCHAR_DRAWN_OBJECT
|
||||
&& ( document instanceof HWPFDocument ) )
|
||||
&& ( wordDocument instanceof HWPFDocument ) )
|
||||
{
|
||||
HWPFDocument doc = (HWPFDocument) document;
|
||||
HWPFDocument doc = (HWPFDocument) wordDocument;
|
||||
processDrawnObject( doc, characterRun, block );
|
||||
continue;
|
||||
}
|
||||
|
@ -398,14 +400,15 @@ public abstract class AbstractWordConverter
|
|||
|
||||
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
|
||||
{
|
||||
if ( document instanceof HWPFDocument )
|
||||
if ( wordDocument instanceof HWPFDocument )
|
||||
{
|
||||
Field aliveField = ( (HWPFDocument) document ).getFields()
|
||||
.getFieldByStartOffset( FieldsDocumentPart.MAIN,
|
||||
Field aliveField = ( (HWPFDocument) wordDocument )
|
||||
.getFields().getFieldByStartOffset(
|
||||
FieldsDocumentPart.MAIN,
|
||||
characterRun.getStartOffset() );
|
||||
if ( aliveField != null )
|
||||
{
|
||||
processField( ( (HWPFDocument) document ), range,
|
||||
processField( ( (HWPFDocument) wordDocument ), range,
|
||||
currentTableLevel, aliveField, block );
|
||||
|
||||
int continueAfter = aliveField.getFieldEndOffset();
|
||||
|
@ -420,8 +423,8 @@ public abstract class AbstractWordConverter
|
|||
}
|
||||
}
|
||||
|
||||
int skipTo = tryDeadField( document, range, currentTableLevel,
|
||||
c, block );
|
||||
int skipTo = tryDeadField( wordDocument, range,
|
||||
currentTableLevel, c, block );
|
||||
|
||||
if ( skipTo != c )
|
||||
{
|
||||
|
@ -610,7 +613,7 @@ public abstract class AbstractWordConverter
|
|||
CharacterRun characterRun, OfficeDrawing officeDrawing,
|
||||
String path, Element block );
|
||||
|
||||
protected abstract void processEndnoteAutonumbered( HWPFDocument doc,
|
||||
protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range endnoteTextRange );
|
||||
|
||||
protected void processField( HWPFDocument hwpfDocument, Range parentRange,
|
||||
|
@ -666,7 +669,7 @@ public abstract class AbstractWordConverter
|
|||
field.secondSubrange( parentRange ), currentBlock );
|
||||
}
|
||||
|
||||
protected abstract void processFootnoteAutonumbered( HWPFDocument doc,
|
||||
protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range footnoteTextRange );
|
||||
|
||||
protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
|
||||
|
@ -734,8 +737,8 @@ public abstract class AbstractWordConverter
|
|||
String pageref );
|
||||
|
||||
protected abstract void processParagraph( HWPFDocumentCore wordDocument,
|
||||
Element parentFopElement, int currentTableLevel,
|
||||
Paragraph paragraph, String bulletText );
|
||||
Element parentElement, int currentTableLevel, Paragraph paragraph,
|
||||
String bulletText );
|
||||
|
||||
protected void processParagraphes( HWPFDocumentCore wordDocument,
|
||||
Element flow, Range range, int currentTableLevel )
|
||||
|
|
|
@ -0,0 +1,179 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hwpf.converter;
|
||||
|
||||
import org.apache.poi.util.Beta;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.Text;
|
||||
|
||||
@Beta
|
||||
public class TextDocumentFacade
|
||||
{
|
||||
protected final Element body;
|
||||
protected final Document document;
|
||||
protected final Element head;
|
||||
protected final Element root;
|
||||
|
||||
protected Element title;
|
||||
protected Text titleText;
|
||||
|
||||
public TextDocumentFacade( Document document )
|
||||
{
|
||||
this.document = document;
|
||||
|
||||
root = document.createElement( "html" );
|
||||
document.appendChild( root );
|
||||
|
||||
body = document.createElement( "body" );
|
||||
head = document.createElement( "head" );
|
||||
|
||||
root.appendChild( head );
|
||||
root.appendChild( body );
|
||||
|
||||
title = document.createElement( "title" );
|
||||
titleText = document.createTextNode( "" );
|
||||
head.appendChild( title );
|
||||
}
|
||||
|
||||
public void addAuthor( String value )
|
||||
{
|
||||
addMeta( "Author", value );
|
||||
}
|
||||
|
||||
public void addDescription( String value )
|
||||
{
|
||||
addMeta( "Description", value );
|
||||
}
|
||||
|
||||
public void addKeywords( String value )
|
||||
{
|
||||
addMeta( "Keywords", value );
|
||||
}
|
||||
|
||||
public void addMeta( final String name, String value )
|
||||
{
|
||||
Element meta = document.createElement( "meta" );
|
||||
|
||||
Element metaName = document.createElement( "name" );
|
||||
metaName.appendChild( document.createTextNode( name + ": " ) );
|
||||
meta.appendChild( metaName );
|
||||
|
||||
Element metaValue = document.createElement( "value" );
|
||||
metaValue.appendChild( document.createTextNode( value + "\n" ) );
|
||||
meta.appendChild( metaValue );
|
||||
|
||||
head.appendChild( meta );
|
||||
}
|
||||
|
||||
public Element createBlock()
|
||||
{
|
||||
return document.createElement( "div" );
|
||||
}
|
||||
|
||||
public Element createHeader1()
|
||||
{
|
||||
Element result = document.createElement( "h1" );
|
||||
result.appendChild( document.createTextNode( " " ) );
|
||||
return result;
|
||||
}
|
||||
|
||||
public Element createHeader2()
|
||||
{
|
||||
Element result = document.createElement( "h2" );
|
||||
result.appendChild( document.createTextNode( " " ) );
|
||||
return result;
|
||||
}
|
||||
|
||||
public Element createParagraph()
|
||||
{
|
||||
return document.createElement( "p" );
|
||||
}
|
||||
|
||||
public Element createTable()
|
||||
{
|
||||
return document.createElement( "table" );
|
||||
}
|
||||
|
||||
public Element createTableBody()
|
||||
{
|
||||
return document.createElement( "tbody" );
|
||||
}
|
||||
|
||||
public Element createTableCell()
|
||||
{
|
||||
return document.createElement( "td" );
|
||||
}
|
||||
|
||||
public Element createTableRow()
|
||||
{
|
||||
return document.createElement( "tr" );
|
||||
}
|
||||
|
||||
public Text createText( String data )
|
||||
{
|
||||
return document.createTextNode( data );
|
||||
}
|
||||
|
||||
public Element createUnorderedList()
|
||||
{
|
||||
return document.createElement( "ul" );
|
||||
}
|
||||
|
||||
public Element getBody()
|
||||
{
|
||||
return body;
|
||||
}
|
||||
|
||||
public Document getDocument()
|
||||
{
|
||||
return document;
|
||||
}
|
||||
|
||||
public Element getHead()
|
||||
{
|
||||
return head;
|
||||
}
|
||||
|
||||
public String getTitle()
|
||||
{
|
||||
if ( title == null )
|
||||
return null;
|
||||
|
||||
return titleText.getTextContent();
|
||||
}
|
||||
|
||||
public void setTitle( String titleText )
|
||||
{
|
||||
if ( WordToHtmlUtils.isEmpty( titleText ) && this.title != null )
|
||||
{
|
||||
this.head.removeChild( this.title );
|
||||
this.title = null;
|
||||
this.titleText = null;
|
||||
}
|
||||
|
||||
if ( this.title == null )
|
||||
{
|
||||
this.title = document.createElement( "title" );
|
||||
this.titleText = document.createTextNode( titleText );
|
||||
this.title.appendChild( this.titleText );
|
||||
this.head.appendChild( title );
|
||||
}
|
||||
|
||||
this.titleText.setData( titleText );
|
||||
}
|
||||
}
|
|
@ -276,7 +276,7 @@ public class WordToFoConverter extends AbstractWordConverter
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex,
|
||||
protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
|
||||
Element block, Range endnoteTextRange )
|
||||
{
|
||||
final String textIndex = String.valueOf( internalLinkCounter
|
||||
|
@ -297,14 +297,14 @@ public class WordToFoConverter extends AbstractWordConverter
|
|||
setId( backwardLink, forwardLinkName );
|
||||
endnote.appendChild( backwardLink );
|
||||
|
||||
processCharacters( doc, Integer.MIN_VALUE, endnoteTextRange, endnote );
|
||||
processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
|
||||
|
||||
WordToFoUtils.compactInlines( endnote );
|
||||
this.endnotes.add( endnote );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processFootnoteAutonumbered( HWPFDocument doc,
|
||||
protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range footnoteTextRange )
|
||||
{
|
||||
final String textIndex = String.valueOf( internalLinkCounter
|
||||
|
@ -333,7 +333,7 @@ public class WordToFoConverter extends AbstractWordConverter
|
|||
footnoteBody.appendChild( footnoteBlock );
|
||||
footNote.appendChild( footnoteBody );
|
||||
|
||||
processCharacters( doc, Integer.MIN_VALUE, footnoteTextRange,
|
||||
processCharacters( wordDocument, Integer.MIN_VALUE, footnoteTextRange,
|
||||
footnoteBlock );
|
||||
|
||||
WordToFoUtils.compactInlines( footnoteBlock );
|
||||
|
|
|
@ -282,17 +282,17 @@ public class WordToHtmlConverter extends AbstractWordConverter
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex,
|
||||
protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
|
||||
Element block, Range endnoteTextRange )
|
||||
{
|
||||
processNoteAutonumbered( doc, "end", noteIndex, block, endnoteTextRange );
|
||||
processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processFootnoteAutonumbered( HWPFDocument doc,
|
||||
protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range footnoteTextRange )
|
||||
{
|
||||
processNoteAutonumbered( doc, "foot", noteIndex, block,
|
||||
processNoteAutonumbered( wordDocument, "foot", noteIndex, block,
|
||||
footnoteTextRange );
|
||||
}
|
||||
|
||||
|
@ -508,11 +508,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
|
|||
}
|
||||
|
||||
protected void processParagraph( HWPFDocumentCore hwpfDocument,
|
||||
Element parentFopElement, int currentTableLevel,
|
||||
Paragraph paragraph, String bulletText )
|
||||
Element parentElement, int currentTableLevel, Paragraph paragraph,
|
||||
String bulletText )
|
||||
{
|
||||
final Element pElement = htmlDocumentFacade.createParagraph();
|
||||
parentFopElement.appendChild( pElement );
|
||||
parentElement.appendChild( pElement );
|
||||
|
||||
StringBuilder style = new StringBuilder();
|
||||
WordToHtmlUtils.addParagraphProperties( paragraph, style );
|
||||
|
|
|
@ -0,0 +1,288 @@
|
|||
package org.apache.poi.hwpf.converter;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import org.apache.poi.hpsf.SummaryInformation;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.HWPFDocumentCore;
|
||||
import org.apache.poi.hwpf.usermodel.Bookmark;
|
||||
import org.apache.poi.hwpf.usermodel.CharacterRun;
|
||||
import org.apache.poi.hwpf.usermodel.OfficeDrawing;
|
||||
import org.apache.poi.hwpf.usermodel.Paragraph;
|
||||
import org.apache.poi.hwpf.usermodel.Picture;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.hwpf.usermodel.Section;
|
||||
import org.apache.poi.hwpf.usermodel.Table;
|
||||
import org.apache.poi.hwpf.usermodel.TableCell;
|
||||
import org.apache.poi.hwpf.usermodel.TableRow;
|
||||
import org.apache.poi.util.Beta;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
|
||||
@Beta
|
||||
public class WordToTextConverter extends AbstractWordConverter
|
||||
{
|
||||
|
||||
/**
|
||||
* Java main() interface to interact with {@link WordToTextConverter}
|
||||
*
|
||||
* <p>
|
||||
* Usage: WordToTextConverter infile outfile
|
||||
* </p>
|
||||
* Where infile is an input .doc file ( Word 95-2007) which will be rendered
|
||||
* as plain text into outfile
|
||||
*/
|
||||
public static void main( String[] args )
|
||||
{
|
||||
if ( args.length < 2 )
|
||||
{
|
||||
System.err
|
||||
.println( "Usage: WordToTextConverter <inputFile.doc> <saveTo.txt>" );
|
||||
return;
|
||||
}
|
||||
|
||||
System.out.println( "Converting " + args[0] );
|
||||
System.out.println( "Saving output to " + args[1] );
|
||||
try
|
||||
{
|
||||
Document doc = WordToTextConverter.process( new File( args[0] ) );
|
||||
|
||||
FileWriter out = new FileWriter( args[1] );
|
||||
DOMSource domSource = new DOMSource( doc );
|
||||
StreamResult streamResult = new StreamResult( out );
|
||||
|
||||
TransformerFactory tf = TransformerFactory.newInstance();
|
||||
Transformer serializer = tf.newTransformer();
|
||||
// TODO set encoding from a command argument
|
||||
serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
|
||||
serializer.setOutputProperty( OutputKeys.INDENT, "no" );
|
||||
serializer.setOutputProperty( OutputKeys.METHOD, "text" );
|
||||
serializer.transform( domSource, streamResult );
|
||||
out.close();
|
||||
}
|
||||
catch ( Exception e )
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
static Document process( File docFile ) throws Exception
|
||||
{
|
||||
final HWPFDocumentCore wordDocument = AbstractWordUtils
|
||||
.loadDoc( docFile );
|
||||
WordToTextConverter wordToTextConverter = new WordToTextConverter(
|
||||
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||
.newDocument() );
|
||||
wordToTextConverter.processDocument( wordDocument );
|
||||
return wordToTextConverter.getDocument();
|
||||
}
|
||||
|
||||
private AtomicInteger noteCounters = new AtomicInteger( 1 );
|
||||
|
||||
private Element notes = null;
|
||||
|
||||
private final TextDocumentFacade textDocumentFacade;
|
||||
|
||||
/**
|
||||
* Creates new instance of {@link WordToTextConverter}. Can be used for
|
||||
* output several {@link HWPFDocument}s into single text document.
|
||||
*
|
||||
* @param document
|
||||
* XML DOM Document used as storage for text pieces
|
||||
*/
|
||||
public WordToTextConverter( Document document )
|
||||
{
|
||||
this.textDocumentFacade = new TextDocumentFacade( document );
|
||||
}
|
||||
|
||||
public Document getDocument()
|
||||
{
|
||||
return textDocumentFacade.getDocument();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void outputCharacters( Element block, CharacterRun characterRun,
|
||||
String text )
|
||||
{
|
||||
block.appendChild( textDocumentFacade.createText( text ) );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processBookmarks( HWPFDocumentCore wordDocument,
|
||||
Element currentBlock, Range range, int currentTableLevel,
|
||||
List<Bookmark> rangeBookmarks )
|
||||
{
|
||||
processCharacters( wordDocument, currentTableLevel, range, currentBlock );
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processDocument( HWPFDocumentCore wordDocument )
|
||||
{
|
||||
super.processDocument( wordDocument );
|
||||
|
||||
if ( notes != null )
|
||||
textDocumentFacade.getBody().appendChild( notes );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processDocumentInformation(
|
||||
SummaryInformation summaryInformation )
|
||||
{
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
|
||||
textDocumentFacade.setTitle( summaryInformation.getTitle() );
|
||||
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
|
||||
textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
|
||||
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
|
||||
textDocumentFacade
|
||||
.addDescription( summaryInformation.getComments() );
|
||||
|
||||
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
|
||||
textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processDrawnObject( HWPFDocument doc,
|
||||
CharacterRun characterRun, OfficeDrawing officeDrawing,
|
||||
String path, Element block )
|
||||
{
|
||||
// ignore
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range endnoteTextRange )
|
||||
{
|
||||
processNote( wordDocument, block, endnoteTextRange );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||
int noteIndex, Element block, Range footnoteTextRange )
|
||||
{
|
||||
processNote( wordDocument, block, footnoteTextRange );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processHyperlink( HWPFDocumentCore wordDocument,
|
||||
Element currentBlock, Range textRange, int currentTableLevel,
|
||||
String hyperlink )
|
||||
{
|
||||
processCharacters( wordDocument, currentTableLevel, textRange,
|
||||
currentBlock );
|
||||
|
||||
currentBlock.appendChild( textDocumentFacade.createText( " ("
|
||||
+ UNICODECHAR_ZERO_WIDTH_SPACE
|
||||
+ hyperlink.replaceAll( "\\/", UNICODECHAR_ZERO_WIDTH_SPACE
|
||||
+ "\\/" + UNICODECHAR_ZERO_WIDTH_SPACE )
|
||||
+ UNICODECHAR_ZERO_WIDTH_SPACE + ")" ) );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processImage( Element currentBlock, boolean inlined,
|
||||
Picture picture )
|
||||
{
|
||||
// ignore
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processLineBreak( Element block, CharacterRun characterRun )
|
||||
{
|
||||
block.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||
}
|
||||
|
||||
protected void processNote( HWPFDocument wordDocument, Element block,
|
||||
Range noteTextRange )
|
||||
{
|
||||
final int noteIndex = noteCounters.getAndIncrement();
|
||||
block.appendChild( textDocumentFacade
|
||||
.createText( UNICODECHAR_ZERO_WIDTH_SPACE + "[" + noteIndex
|
||||
+ "]" + UNICODECHAR_ZERO_WIDTH_SPACE ) );
|
||||
|
||||
if ( notes == null )
|
||||
notes = textDocumentFacade.createBlock();
|
||||
|
||||
Element note = textDocumentFacade.createBlock();
|
||||
notes.appendChild( note );
|
||||
|
||||
note.appendChild( textDocumentFacade.createText( "^" + noteIndex
|
||||
+ "\t " ) );
|
||||
processCharacters( wordDocument, Integer.MIN_VALUE, noteTextRange, note );
|
||||
note.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processPageref( HWPFDocumentCore wordDocument,
|
||||
Element currentBlock, Range textRange, int currentTableLevel,
|
||||
String pageref )
|
||||
{
|
||||
processCharacters( wordDocument, currentTableLevel, textRange,
|
||||
currentBlock );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processParagraph( HWPFDocumentCore wordDocument,
|
||||
Element parentElement, int currentTableLevel, Paragraph paragraph,
|
||||
String bulletText )
|
||||
{
|
||||
Element pElement = textDocumentFacade.createParagraph();
|
||||
pElement.appendChild( textDocumentFacade.createText( bulletText ) );
|
||||
processCharacters( wordDocument, currentTableLevel, paragraph, pElement );
|
||||
pElement.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||
parentElement.appendChild( pElement );
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processSection( HWPFDocumentCore wordDocument,
|
||||
Section section, int s )
|
||||
{
|
||||
Element sectionElement = textDocumentFacade.createBlock();
|
||||
processParagraphes( wordDocument, sectionElement, section,
|
||||
Integer.MIN_VALUE );
|
||||
sectionElement.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||
textDocumentFacade.body.appendChild( sectionElement );
|
||||
}
|
||||
|
||||
protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
|
||||
Table table )
|
||||
{
|
||||
final int tableRows = table.numRows();
|
||||
for ( int r = 0; r < tableRows; r++ )
|
||||
{
|
||||
TableRow tableRow = table.getRow( r );
|
||||
|
||||
Element tableRowElement = textDocumentFacade.createTableRow();
|
||||
|
||||
final int rowCells = tableRow.numCells();
|
||||
for ( int c = 0; c < rowCells; c++ )
|
||||
{
|
||||
TableCell tableCell = tableRow.getCell( c );
|
||||
|
||||
Element tableCellElement = textDocumentFacade.createTableCell();
|
||||
|
||||
if ( c != 0 )
|
||||
tableCellElement.appendChild( textDocumentFacade
|
||||
.createText( "\t" ) );
|
||||
|
||||
processParagraphes( hwpfDocument, tableCellElement, tableCell,
|
||||
table.getTableLevel() );
|
||||
tableRowElement.appendChild( tableCellElement );
|
||||
}
|
||||
|
||||
tableRowElement.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||
flow.appendChild( tableRowElement );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -45,7 +45,8 @@ public class TestWordToConverterSuite
|
|||
|
||||
public static Test suite()
|
||||
{
|
||||
TestSuite suite = new TestSuite(TestWordToConverterSuite.class.getName());
|
||||
TestSuite suite = new TestSuite(
|
||||
TestWordToConverterSuite.class.getName() );
|
||||
|
||||
File directory = POIDataSamples.getDocumentInstance().getFile(
|
||||
"../document" );
|
||||
|
@ -63,14 +64,21 @@ public class TestWordToConverterSuite
|
|||
{
|
||||
public void runTest() throws Exception
|
||||
{
|
||||
test( child, false );
|
||||
testFo( child );
|
||||
}
|
||||
} );
|
||||
suite.addTest( new TestCase( name + " [HTML]" )
|
||||
{
|
||||
public void runTest() throws Exception
|
||||
{
|
||||
test( child, true );
|
||||
testHtml( child );
|
||||
}
|
||||
} );
|
||||
suite.addTest( new TestCase( name + " [TEXT]" )
|
||||
{
|
||||
public void runTest() throws Exception
|
||||
{
|
||||
testText( child );
|
||||
}
|
||||
} );
|
||||
|
||||
|
@ -79,7 +87,7 @@ public class TestWordToConverterSuite
|
|||
return suite;
|
||||
}
|
||||
|
||||
protected static void test( File child, boolean html ) throws Exception
|
||||
protected static void testFo( File child ) throws Exception
|
||||
{
|
||||
HWPFDocumentCore hwpfDocument;
|
||||
try
|
||||
|
@ -88,7 +96,6 @@ public class TestWordToConverterSuite
|
|||
}
|
||||
catch ( Exception exc )
|
||||
{
|
||||
// unable to parse file -- not WordToFoConverter fault
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -102,14 +109,74 @@ public class TestWordToConverterSuite
|
|||
Transformer transformer = TransformerFactory.newInstance()
|
||||
.newTransformer();
|
||||
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
|
||||
transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
|
||||
transformer.setOutputProperty( OutputKeys.INDENT, "false" );
|
||||
transformer.transform(
|
||||
new DOMSource( wordToFoConverter.getDocument() ),
|
||||
new StreamResult( stringWriter ) );
|
||||
|
||||
if ( html )
|
||||
transformer.setOutputProperty( OutputKeys.METHOD, "html" );
|
||||
// no exceptions
|
||||
}
|
||||
|
||||
protected static void testHtml( File child ) throws Exception
|
||||
{
|
||||
HWPFDocumentCore hwpfDocument;
|
||||
try
|
||||
{
|
||||
hwpfDocument = AbstractWordUtils.loadDoc( child );
|
||||
}
|
||||
catch ( Exception exc )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
|
||||
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||
.newDocument() );
|
||||
wordToHtmlConverter.processDocument( hwpfDocument );
|
||||
|
||||
StringWriter stringWriter = new StringWriter();
|
||||
|
||||
Transformer transformer = TransformerFactory.newInstance()
|
||||
.newTransformer();
|
||||
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
|
||||
transformer.setOutputProperty( OutputKeys.INDENT, "false" );
|
||||
transformer.setOutputProperty( OutputKeys.METHOD, "html" );
|
||||
transformer.transform(
|
||||
new DOMSource( wordToHtmlConverter.getDocument() ),
|
||||
new StreamResult( stringWriter ) );
|
||||
|
||||
// no exceptions
|
||||
}
|
||||
|
||||
protected static void testText( File child ) throws Exception
|
||||
{
|
||||
HWPFDocumentCore wordDocument;
|
||||
try
|
||||
{
|
||||
wordDocument = AbstractWordUtils.loadDoc( child );
|
||||
}
|
||||
catch ( Exception exc )
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
WordToTextConverter wordToTextConverter = new WordToTextConverter(
|
||||
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||
.newDocument() );
|
||||
wordToTextConverter.processDocument( wordDocument );
|
||||
|
||||
StringWriter stringWriter = new StringWriter();
|
||||
|
||||
Transformer transformer = TransformerFactory.newInstance()
|
||||
.newTransformer();
|
||||
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
|
||||
transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
|
||||
transformer.setOutputProperty( OutputKeys.METHOD, "text" );
|
||||
transformer.transform(
|
||||
new DOMSource( wordToTextConverter.getDocument() ),
|
||||
new StreamResult( stringWriter ) );
|
||||
|
||||
stringWriter.toString();
|
||||
// no exceptions
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue