mirror of https://github.com/apache/poi.git
add Word-to-Text converter
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1155281 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fa26e746cc
commit
888f51c566
|
@ -71,9 +71,9 @@ public abstract class AbstractWordConverter
|
||||||
|
|
||||||
private static final byte SPECCHAR_DRAWN_OBJECT = 8;
|
private static final byte SPECCHAR_DRAWN_OBJECT = 8;
|
||||||
|
|
||||||
private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
|
protected static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
|
||||||
|
|
||||||
private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
|
protected static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
|
||||||
|
|
||||||
private static void addToStructures( List<Structure> structures,
|
private static void addToStructures( List<Structure> structures,
|
||||||
Structure structure )
|
Structure structure )
|
||||||
|
@ -205,7 +205,7 @@ public abstract class AbstractWordConverter
|
||||||
Element currentBlock, Range range, int currentTableLevel,
|
Element currentBlock, Range range, int currentTableLevel,
|
||||||
List<Bookmark> rangeBookmarks );
|
List<Bookmark> rangeBookmarks );
|
||||||
|
|
||||||
protected boolean processCharacters( final HWPFDocumentCore document,
|
protected boolean processCharacters( final HWPFDocumentCore wordDocument,
|
||||||
final int currentTableLevel, final Range range, final Element block )
|
final int currentTableLevel, final Range range, final Element block )
|
||||||
{
|
{
|
||||||
if ( range == null )
|
if ( range == null )
|
||||||
|
@ -220,9 +220,9 @@ public abstract class AbstractWordConverter
|
||||||
* reconstruct the structure of range -- sergey
|
* reconstruct the structure of range -- sergey
|
||||||
*/
|
*/
|
||||||
List<Structure> structures = new LinkedList<Structure>();
|
List<Structure> structures = new LinkedList<Structure>();
|
||||||
if ( document instanceof HWPFDocument )
|
if ( wordDocument instanceof HWPFDocument )
|
||||||
{
|
{
|
||||||
final HWPFDocument doc = (HWPFDocument) document;
|
final HWPFDocument doc = (HWPFDocument) wordDocument;
|
||||||
|
|
||||||
Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks()
|
Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks()
|
||||||
.getBookmarksStartedBetween( range.getStartOffset(),
|
.getBookmarksStartedBetween( range.getStartOffset(),
|
||||||
|
@ -247,7 +247,7 @@ public abstract class AbstractWordConverter
|
||||||
CharacterRun characterRun = range.getCharacterRun( c );
|
CharacterRun characterRun = range.getCharacterRun( c );
|
||||||
if ( characterRun == null )
|
if ( characterRun == null )
|
||||||
throw new AssertionError();
|
throw new AssertionError();
|
||||||
Field aliveField = ( (HWPFDocument) document ).getFields()
|
Field aliveField = ( (HWPFDocument) wordDocument ).getFields()
|
||||||
.getFieldByStartOffset( FieldsDocumentPart.MAIN,
|
.getFieldByStartOffset( FieldsDocumentPart.MAIN,
|
||||||
characterRun.getStartOffset() );
|
characterRun.getStartOffset() );
|
||||||
if ( aliveField != null )
|
if ( aliveField != null )
|
||||||
|
@ -273,14 +273,15 @@ public abstract class AbstractWordConverter
|
||||||
return "BetweenStructuresSubrange " + super.toString();
|
return "BetweenStructuresSubrange " + super.toString();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
processCharacters( document, currentTableLevel, subrange, block );
|
processCharacters( wordDocument, currentTableLevel, subrange,
|
||||||
|
block );
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( structure.structure instanceof Bookmark )
|
if ( structure.structure instanceof Bookmark )
|
||||||
{
|
{
|
||||||
// other bookmarks with same boundaries
|
// other bookmarks with same boundaries
|
||||||
List<Bookmark> bookmarks = new LinkedList<Bookmark>();
|
List<Bookmark> bookmarks = new LinkedList<Bookmark>();
|
||||||
for ( Bookmark bookmark : ( (HWPFDocument) document )
|
for ( Bookmark bookmark : ( (HWPFDocument) wordDocument )
|
||||||
.getBookmarks()
|
.getBookmarks()
|
||||||
.getBookmarksStartedBetween( structure.start,
|
.getBookmarksStartedBetween( structure.start,
|
||||||
structure.start + 1 ).values().iterator()
|
structure.start + 1 ).values().iterator()
|
||||||
|
@ -306,7 +307,7 @@ public abstract class AbstractWordConverter
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
processBookmarks( document, block, subrange,
|
processBookmarks( wordDocument, block, subrange,
|
||||||
currentTableLevel, bookmarks );
|
currentTableLevel, bookmarks );
|
||||||
}
|
}
|
||||||
finally
|
finally
|
||||||
|
@ -317,7 +318,7 @@ public abstract class AbstractWordConverter
|
||||||
else if ( structure.structure instanceof Field )
|
else if ( structure.structure instanceof Field )
|
||||||
{
|
{
|
||||||
Field field = (Field) structure.structure;
|
Field field = (Field) structure.structure;
|
||||||
processField( (HWPFDocument) document, range,
|
processField( (HWPFDocument) wordDocument, range,
|
||||||
currentTableLevel, field, block );
|
currentTableLevel, field, block );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -349,7 +350,8 @@ public abstract class AbstractWordConverter
|
||||||
return "AfterStructureSubrange " + super.toString();
|
return "AfterStructureSubrange " + super.toString();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
processCharacters( document, currentTableLevel, subrange, block );
|
processCharacters( wordDocument, currentTableLevel, subrange,
|
||||||
|
block );
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -361,11 +363,11 @@ public abstract class AbstractWordConverter
|
||||||
if ( characterRun == null )
|
if ( characterRun == null )
|
||||||
throw new AssertionError();
|
throw new AssertionError();
|
||||||
|
|
||||||
if ( document instanceof HWPFDocument
|
if ( wordDocument instanceof HWPFDocument
|
||||||
&& ( (HWPFDocument) document ).getPicturesTable()
|
&& ( (HWPFDocument) wordDocument ).getPicturesTable()
|
||||||
.hasPicture( characterRun ) )
|
.hasPicture( characterRun ) )
|
||||||
{
|
{
|
||||||
HWPFDocument newFormat = (HWPFDocument) document;
|
HWPFDocument newFormat = (HWPFDocument) wordDocument;
|
||||||
Picture picture = newFormat.getPicturesTable().extractPicture(
|
Picture picture = newFormat.getPicturesTable().extractPicture(
|
||||||
characterRun, true );
|
characterRun, true );
|
||||||
|
|
||||||
|
@ -381,16 +383,16 @@ public abstract class AbstractWordConverter
|
||||||
if ( characterRun.isSpecialCharacter() )
|
if ( characterRun.isSpecialCharacter() )
|
||||||
{
|
{
|
||||||
if ( text.charAt( 0 ) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE
|
if ( text.charAt( 0 ) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE
|
||||||
&& ( document instanceof HWPFDocument ) )
|
&& ( wordDocument instanceof HWPFDocument ) )
|
||||||
{
|
{
|
||||||
HWPFDocument doc = (HWPFDocument) document;
|
HWPFDocument doc = (HWPFDocument) wordDocument;
|
||||||
processNoteAnchor( doc, characterRun, block );
|
processNoteAnchor( doc, characterRun, block );
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if ( text.charAt( 0 ) == SPECCHAR_DRAWN_OBJECT
|
if ( text.charAt( 0 ) == SPECCHAR_DRAWN_OBJECT
|
||||||
&& ( document instanceof HWPFDocument ) )
|
&& ( wordDocument instanceof HWPFDocument ) )
|
||||||
{
|
{
|
||||||
HWPFDocument doc = (HWPFDocument) document;
|
HWPFDocument doc = (HWPFDocument) wordDocument;
|
||||||
processDrawnObject( doc, characterRun, block );
|
processDrawnObject( doc, characterRun, block );
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -398,14 +400,15 @@ public abstract class AbstractWordConverter
|
||||||
|
|
||||||
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
|
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
|
||||||
{
|
{
|
||||||
if ( document instanceof HWPFDocument )
|
if ( wordDocument instanceof HWPFDocument )
|
||||||
{
|
{
|
||||||
Field aliveField = ( (HWPFDocument) document ).getFields()
|
Field aliveField = ( (HWPFDocument) wordDocument )
|
||||||
.getFieldByStartOffset( FieldsDocumentPart.MAIN,
|
.getFields().getFieldByStartOffset(
|
||||||
|
FieldsDocumentPart.MAIN,
|
||||||
characterRun.getStartOffset() );
|
characterRun.getStartOffset() );
|
||||||
if ( aliveField != null )
|
if ( aliveField != null )
|
||||||
{
|
{
|
||||||
processField( ( (HWPFDocument) document ), range,
|
processField( ( (HWPFDocument) wordDocument ), range,
|
||||||
currentTableLevel, aliveField, block );
|
currentTableLevel, aliveField, block );
|
||||||
|
|
||||||
int continueAfter = aliveField.getFieldEndOffset();
|
int continueAfter = aliveField.getFieldEndOffset();
|
||||||
|
@ -420,8 +423,8 @@ public abstract class AbstractWordConverter
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int skipTo = tryDeadField( document, range, currentTableLevel,
|
int skipTo = tryDeadField( wordDocument, range,
|
||||||
c, block );
|
currentTableLevel, c, block );
|
||||||
|
|
||||||
if ( skipTo != c )
|
if ( skipTo != c )
|
||||||
{
|
{
|
||||||
|
@ -610,7 +613,7 @@ public abstract class AbstractWordConverter
|
||||||
CharacterRun characterRun, OfficeDrawing officeDrawing,
|
CharacterRun characterRun, OfficeDrawing officeDrawing,
|
||||||
String path, Element block );
|
String path, Element block );
|
||||||
|
|
||||||
protected abstract void processEndnoteAutonumbered( HWPFDocument doc,
|
protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
|
||||||
int noteIndex, Element block, Range endnoteTextRange );
|
int noteIndex, Element block, Range endnoteTextRange );
|
||||||
|
|
||||||
protected void processField( HWPFDocument hwpfDocument, Range parentRange,
|
protected void processField( HWPFDocument hwpfDocument, Range parentRange,
|
||||||
|
@ -666,7 +669,7 @@ public abstract class AbstractWordConverter
|
||||||
field.secondSubrange( parentRange ), currentBlock );
|
field.secondSubrange( parentRange ), currentBlock );
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract void processFootnoteAutonumbered( HWPFDocument doc,
|
protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||||
int noteIndex, Element block, Range footnoteTextRange );
|
int noteIndex, Element block, Range footnoteTextRange );
|
||||||
|
|
||||||
protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
|
protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
|
||||||
|
@ -734,8 +737,8 @@ public abstract class AbstractWordConverter
|
||||||
String pageref );
|
String pageref );
|
||||||
|
|
||||||
protected abstract void processParagraph( HWPFDocumentCore wordDocument,
|
protected abstract void processParagraph( HWPFDocumentCore wordDocument,
|
||||||
Element parentFopElement, int currentTableLevel,
|
Element parentElement, int currentTableLevel, Paragraph paragraph,
|
||||||
Paragraph paragraph, String bulletText );
|
String bulletText );
|
||||||
|
|
||||||
protected void processParagraphes( HWPFDocumentCore wordDocument,
|
protected void processParagraphes( HWPFDocumentCore wordDocument,
|
||||||
Element flow, Range range, int currentTableLevel )
|
Element flow, Range range, int currentTableLevel )
|
||||||
|
|
|
@ -0,0 +1,179 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hwpf.converter;
|
||||||
|
|
||||||
|
import org.apache.poi.util.Beta;
|
||||||
|
import org.w3c.dom.Document;
|
||||||
|
import org.w3c.dom.Element;
|
||||||
|
import org.w3c.dom.Text;
|
||||||
|
|
||||||
|
@Beta
|
||||||
|
/*
 * Facade over a W3C DOM Document that builds the element skeleton
 * (html / head / body) used as intermediate storage for text pieces, and
 * offers small factory methods for the elements and text nodes placed in it.
 */
public class TextDocumentFacade
{
    protected final Element body;
    protected final Document document;
    protected final Element head;
    protected final Element root;

    /** Current title element, or null when the title has been removed. */
    protected Element title;
    /** Text node holding the title value; null exactly when title is null. */
    protected Text titleText;

    /**
     * Creates the html, head and body elements inside the given document and
     * registers an (initially empty) title element in the head.
     *
     * @param document
     *            DOM document that receives the root element
     */
    public TextDocumentFacade( Document document )
    {
        this.document = document;

        root = document.createElement( "html" );
        document.appendChild( root );

        body = document.createElement( "body" );
        head = document.createElement( "head" );

        root.appendChild( head );
        root.appendChild( body );

        title = document.createElement( "title" );
        titleText = document.createTextNode( "" );
        // the text node must be attached to the title element, otherwise
        // later setTitle() calls would update a node that is not in the tree
        title.appendChild( titleText );
        head.appendChild( title );
    }

    /** Adds a meta entry named "Author". */
    public void addAuthor( String value )
    {
        addMeta( "Author", value );
    }

    /** Adds a meta entry named "Description". */
    public void addDescription( String value )
    {
        addMeta( "Description", value );
    }

    /** Adds a meta entry named "Keywords". */
    public void addKeywords( String value )
    {
        addMeta( "Keywords", value );
    }

    /**
     * Appends a meta element to the head. Its text content is the name
     * followed by ": ", then the value followed by a line feed.
     *
     * @param name
     *            entry name
     * @param value
     *            entry value
     */
    public void addMeta( final String name, String value )
    {
        Element meta = document.createElement( "meta" );

        Element metaName = document.createElement( "name" );
        metaName.appendChild( document.createTextNode( name + ": " ) );
        meta.appendChild( metaName );

        Element metaValue = document.createElement( "value" );
        metaValue.appendChild( document.createTextNode( value + "\n" ) );
        meta.appendChild( metaValue );

        head.appendChild( meta );
    }

    /** @return new, unattached "div" element */
    public Element createBlock()
    {
        return document.createElement( "div" );
    }

    /** @return new, unattached "h1" element holding a single-space text node */
    public Element createHeader1()
    {
        Element result = document.createElement( "h1" );
        result.appendChild( document.createTextNode( " " ) );
        return result;
    }

    /** @return new, unattached "h2" element holding a single-space text node */
    public Element createHeader2()
    {
        Element result = document.createElement( "h2" );
        result.appendChild( document.createTextNode( " " ) );
        return result;
    }

    /** @return new, unattached "p" element */
    public Element createParagraph()
    {
        return document.createElement( "p" );
    }

    /** @return new, unattached "table" element */
    public Element createTable()
    {
        return document.createElement( "table" );
    }

    /** @return new, unattached "tbody" element */
    public Element createTableBody()
    {
        return document.createElement( "tbody" );
    }

    /** @return new, unattached "td" element */
    public Element createTableCell()
    {
        return document.createElement( "td" );
    }

    /** @return new, unattached "tr" element */
    public Element createTableRow()
    {
        return document.createElement( "tr" );
    }

    /** @return new, unattached text node with the given content */
    public Text createText( String data )
    {
        return document.createTextNode( data );
    }

    /** @return new, unattached "ul" element */
    public Element createUnorderedList()
    {
        return document.createElement( "ul" );
    }

    public Element getBody()
    {
        return body;
    }

    public Document getDocument()
    {
        return document;
    }

    public Element getHead()
    {
        return head;
    }

    /**
     * @return current title text, or null if the title element has been
     *         removed by {@link #setTitle(String)}
     */
    public String getTitle()
    {
        if ( title == null )
            return null;

        return titleText.getTextContent();
    }

    /**
     * Sets the document title. A null or zero-length value removes the title
     * element from the head (if present); any other value creates the title
     * element when needed and updates its text.
     *
     * @param titleText
     *            new title, may be null or empty
     */
    public void setTitle( String titleText )
    {
        if ( titleText == null || titleText.length() == 0 )
        {
            if ( this.title != null )
            {
                this.head.removeChild( this.title );
                this.title = null;
                this.titleText = null;
            }
            // nothing to (re)create for an empty title
            return;
        }

        if ( this.title == null )
        {
            this.title = document.createElement( "title" );
            this.titleText = document.createTextNode( titleText );
            this.title.appendChild( this.titleText );
            this.head.appendChild( title );
        }

        this.titleText.setData( titleText );
    }
}
|
|
@ -276,7 +276,7 @@ public class WordToFoConverter extends AbstractWordConverter
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex,
|
protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
|
||||||
Element block, Range endnoteTextRange )
|
Element block, Range endnoteTextRange )
|
||||||
{
|
{
|
||||||
final String textIndex = String.valueOf( internalLinkCounter
|
final String textIndex = String.valueOf( internalLinkCounter
|
||||||
|
@ -297,14 +297,14 @@ public class WordToFoConverter extends AbstractWordConverter
|
||||||
setId( backwardLink, forwardLinkName );
|
setId( backwardLink, forwardLinkName );
|
||||||
endnote.appendChild( backwardLink );
|
endnote.appendChild( backwardLink );
|
||||||
|
|
||||||
processCharacters( doc, Integer.MIN_VALUE, endnoteTextRange, endnote );
|
processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
|
||||||
|
|
||||||
WordToFoUtils.compactInlines( endnote );
|
WordToFoUtils.compactInlines( endnote );
|
||||||
this.endnotes.add( endnote );
|
this.endnotes.add( endnote );
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void processFootnoteAutonumbered( HWPFDocument doc,
|
protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||||
int noteIndex, Element block, Range footnoteTextRange )
|
int noteIndex, Element block, Range footnoteTextRange )
|
||||||
{
|
{
|
||||||
final String textIndex = String.valueOf( internalLinkCounter
|
final String textIndex = String.valueOf( internalLinkCounter
|
||||||
|
@ -333,7 +333,7 @@ public class WordToFoConverter extends AbstractWordConverter
|
||||||
footnoteBody.appendChild( footnoteBlock );
|
footnoteBody.appendChild( footnoteBlock );
|
||||||
footNote.appendChild( footnoteBody );
|
footNote.appendChild( footnoteBody );
|
||||||
|
|
||||||
processCharacters( doc, Integer.MIN_VALUE, footnoteTextRange,
|
processCharacters( wordDocument, Integer.MIN_VALUE, footnoteTextRange,
|
||||||
footnoteBlock );
|
footnoteBlock );
|
||||||
|
|
||||||
WordToFoUtils.compactInlines( footnoteBlock );
|
WordToFoUtils.compactInlines( footnoteBlock );
|
||||||
|
|
|
@ -282,17 +282,17 @@ public class WordToHtmlConverter extends AbstractWordConverter
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void processEndnoteAutonumbered( HWPFDocument doc, int noteIndex,
|
protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
|
||||||
Element block, Range endnoteTextRange )
|
Element block, Range endnoteTextRange )
|
||||||
{
|
{
|
||||||
processNoteAutonumbered( doc, "end", noteIndex, block, endnoteTextRange );
|
processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void processFootnoteAutonumbered( HWPFDocument doc,
|
protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||||
int noteIndex, Element block, Range footnoteTextRange )
|
int noteIndex, Element block, Range footnoteTextRange )
|
||||||
{
|
{
|
||||||
processNoteAutonumbered( doc, "foot", noteIndex, block,
|
processNoteAutonumbered( wordDocument, "foot", noteIndex, block,
|
||||||
footnoteTextRange );
|
footnoteTextRange );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -508,11 +508,11 @@ public class WordToHtmlConverter extends AbstractWordConverter
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void processParagraph( HWPFDocumentCore hwpfDocument,
|
protected void processParagraph( HWPFDocumentCore hwpfDocument,
|
||||||
Element parentFopElement, int currentTableLevel,
|
Element parentElement, int currentTableLevel, Paragraph paragraph,
|
||||||
Paragraph paragraph, String bulletText )
|
String bulletText )
|
||||||
{
|
{
|
||||||
final Element pElement = htmlDocumentFacade.createParagraph();
|
final Element pElement = htmlDocumentFacade.createParagraph();
|
||||||
parentFopElement.appendChild( pElement );
|
parentElement.appendChild( pElement );
|
||||||
|
|
||||||
StringBuilder style = new StringBuilder();
|
StringBuilder style = new StringBuilder();
|
||||||
WordToHtmlUtils.addParagraphProperties( paragraph, style );
|
WordToHtmlUtils.addParagraphProperties( paragraph, style );
|
||||||
|
|
|
@ -0,0 +1,288 @@
|
||||||
|
package org.apache.poi.hwpf.converter;
|
||||||
|
|
||||||
|
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.OutputStream;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.usermodel.Bookmark;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.OfficeDrawing;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.util.Beta;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
|
||||||
|
|
||||||
|
@Beta
|
||||||
|
public class WordToTextConverter extends AbstractWordConverter
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Java main() interface to interact with {@link WordToTextConverter}
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Usage: WordToTextConverter infile outfile
|
||||||
|
* </p>
|
||||||
|
* Where infile is an input .doc file ( Word 95-2007) which will be rendered
|
||||||
|
* as plain text into outfile
|
||||||
|
*/
|
||||||
|
public static void main( String[] args )
|
||||||
|
{
|
||||||
|
if ( args.length < 2 )
|
||||||
|
{
|
||||||
|
System.err
|
||||||
|
.println( "Usage: WordToTextConverter <inputFile.doc> <saveTo.txt>" );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println( "Converting " + args[0] );
|
||||||
|
System.out.println( "Saving output to " + args[1] );
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Document doc = WordToTextConverter.process( new File( args[0] ) );
|
||||||
|
|
||||||
|
FileWriter out = new FileWriter( args[1] );
|
||||||
|
DOMSource domSource = new DOMSource( doc );
|
||||||
|
StreamResult streamResult = new StreamResult( out );
|
||||||
|
|
||||||
|
TransformerFactory tf = TransformerFactory.newInstance();
|
||||||
|
Transformer serializer = tf.newTransformer();
|
||||||
|
// TODO set encoding from a command argument
|
||||||
|
serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
|
||||||
|
serializer.setOutputProperty( OutputKeys.INDENT, "no" );
|
||||||
|
serializer.setOutputProperty( OutputKeys.METHOD, "text" );
|
||||||
|
serializer.transform( domSource, streamResult );
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
catch ( Exception e )
|
||||||
|
{
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static Document process( File docFile ) throws Exception
|
||||||
|
{
|
||||||
|
final HWPFDocumentCore wordDocument = AbstractWordUtils
|
||||||
|
.loadDoc( docFile );
|
||||||
|
WordToTextConverter wordToTextConverter = new WordToTextConverter(
|
||||||
|
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||||
|
.newDocument() );
|
||||||
|
wordToTextConverter.processDocument( wordDocument );
|
||||||
|
return wordToTextConverter.getDocument();
|
||||||
|
}
|
||||||
|
|
||||||
|
private AtomicInteger noteCounters = new AtomicInteger( 1 );
|
||||||
|
|
||||||
|
private Element notes = null;
|
||||||
|
|
||||||
|
private final TextDocumentFacade textDocumentFacade;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates new instance of {@link WordToTextConverter}. Can be used for
|
||||||
|
* output several {@link HWPFDocument}s into single text document.
|
||||||
|
*
|
||||||
|
* @param document
|
||||||
|
* XML DOM Document used as storage for text pieces
|
||||||
|
*/
|
||||||
|
public WordToTextConverter( Document document )
|
||||||
|
{
|
||||||
|
this.textDocumentFacade = new TextDocumentFacade( document );
|
||||||
|
}
|
||||||
|
|
||||||
|
public Document getDocument()
|
||||||
|
{
|
||||||
|
return textDocumentFacade.getDocument();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void outputCharacters( Element block, CharacterRun characterRun,
|
||||||
|
String text )
|
||||||
|
{
|
||||||
|
block.appendChild( textDocumentFacade.createText( text ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processBookmarks( HWPFDocumentCore wordDocument,
|
||||||
|
Element currentBlock, Range range, int currentTableLevel,
|
||||||
|
List<Bookmark> rangeBookmarks )
|
||||||
|
{
|
||||||
|
processCharacters( wordDocument, currentTableLevel, range, currentBlock );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void processDocument( HWPFDocumentCore wordDocument )
|
||||||
|
{
|
||||||
|
super.processDocument( wordDocument );
|
||||||
|
|
||||||
|
if ( notes != null )
|
||||||
|
textDocumentFacade.getBody().appendChild( notes );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processDocumentInformation(
|
||||||
|
SummaryInformation summaryInformation )
|
||||||
|
{
|
||||||
|
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
|
||||||
|
textDocumentFacade.setTitle( summaryInformation.getTitle() );
|
||||||
|
|
||||||
|
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
|
||||||
|
textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
|
||||||
|
|
||||||
|
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
|
||||||
|
textDocumentFacade
|
||||||
|
.addDescription( summaryInformation.getComments() );
|
||||||
|
|
||||||
|
if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
|
||||||
|
textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processDrawnObject( HWPFDocument doc,
|
||||||
|
CharacterRun characterRun, OfficeDrawing officeDrawing,
|
||||||
|
String path, Element block )
|
||||||
|
{
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
|
||||||
|
int noteIndex, Element block, Range endnoteTextRange )
|
||||||
|
{
|
||||||
|
processNote( wordDocument, block, endnoteTextRange );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processFootnoteAutonumbered( HWPFDocument wordDocument,
|
||||||
|
int noteIndex, Element block, Range footnoteTextRange )
|
||||||
|
{
|
||||||
|
processNote( wordDocument, block, footnoteTextRange );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processHyperlink( HWPFDocumentCore wordDocument,
|
||||||
|
Element currentBlock, Range textRange, int currentTableLevel,
|
||||||
|
String hyperlink )
|
||||||
|
{
|
||||||
|
processCharacters( wordDocument, currentTableLevel, textRange,
|
||||||
|
currentBlock );
|
||||||
|
|
||||||
|
currentBlock.appendChild( textDocumentFacade.createText( " ("
|
||||||
|
+ UNICODECHAR_ZERO_WIDTH_SPACE
|
||||||
|
+ hyperlink.replaceAll( "\\/", UNICODECHAR_ZERO_WIDTH_SPACE
|
||||||
|
+ "\\/" + UNICODECHAR_ZERO_WIDTH_SPACE )
|
||||||
|
+ UNICODECHAR_ZERO_WIDTH_SPACE + ")" ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processImage( Element currentBlock, boolean inlined,
|
||||||
|
Picture picture )
|
||||||
|
{
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processLineBreak( Element block, CharacterRun characterRun )
|
||||||
|
{
|
||||||
|
block.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void processNote( HWPFDocument wordDocument, Element block,
|
||||||
|
Range noteTextRange )
|
||||||
|
{
|
||||||
|
final int noteIndex = noteCounters.getAndIncrement();
|
||||||
|
block.appendChild( textDocumentFacade
|
||||||
|
.createText( UNICODECHAR_ZERO_WIDTH_SPACE + "[" + noteIndex
|
||||||
|
+ "]" + UNICODECHAR_ZERO_WIDTH_SPACE ) );
|
||||||
|
|
||||||
|
if ( notes == null )
|
||||||
|
notes = textDocumentFacade.createBlock();
|
||||||
|
|
||||||
|
Element note = textDocumentFacade.createBlock();
|
||||||
|
notes.appendChild( note );
|
||||||
|
|
||||||
|
note.appendChild( textDocumentFacade.createText( "^" + noteIndex
|
||||||
|
+ "\t " ) );
|
||||||
|
processCharacters( wordDocument, Integer.MIN_VALUE, noteTextRange, note );
|
||||||
|
note.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processPageref( HWPFDocumentCore wordDocument,
|
||||||
|
Element currentBlock, Range textRange, int currentTableLevel,
|
||||||
|
String pageref )
|
||||||
|
{
|
||||||
|
processCharacters( wordDocument, currentTableLevel, textRange,
|
||||||
|
currentBlock );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processParagraph( HWPFDocumentCore wordDocument,
|
||||||
|
Element parentElement, int currentTableLevel, Paragraph paragraph,
|
||||||
|
String bulletText )
|
||||||
|
{
|
||||||
|
Element pElement = textDocumentFacade.createParagraph();
|
||||||
|
pElement.appendChild( textDocumentFacade.createText( bulletText ) );
|
||||||
|
processCharacters( wordDocument, currentTableLevel, paragraph, pElement );
|
||||||
|
pElement.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||||
|
parentElement.appendChild( pElement );
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void processSection( HWPFDocumentCore wordDocument,
|
||||||
|
Section section, int s )
|
||||||
|
{
|
||||||
|
Element sectionElement = textDocumentFacade.createBlock();
|
||||||
|
processParagraphes( wordDocument, sectionElement, section,
|
||||||
|
Integer.MIN_VALUE );
|
||||||
|
sectionElement.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||||
|
textDocumentFacade.body.appendChild( sectionElement );
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
|
||||||
|
Table table )
|
||||||
|
{
|
||||||
|
final int tableRows = table.numRows();
|
||||||
|
for ( int r = 0; r < tableRows; r++ )
|
||||||
|
{
|
||||||
|
TableRow tableRow = table.getRow( r );
|
||||||
|
|
||||||
|
Element tableRowElement = textDocumentFacade.createTableRow();
|
||||||
|
|
||||||
|
final int rowCells = tableRow.numCells();
|
||||||
|
for ( int c = 0; c < rowCells; c++ )
|
||||||
|
{
|
||||||
|
TableCell tableCell = tableRow.getCell( c );
|
||||||
|
|
||||||
|
Element tableCellElement = textDocumentFacade.createTableCell();
|
||||||
|
|
||||||
|
if ( c != 0 )
|
||||||
|
tableCellElement.appendChild( textDocumentFacade
|
||||||
|
.createText( "\t" ) );
|
||||||
|
|
||||||
|
processParagraphes( hwpfDocument, tableCellElement, tableCell,
|
||||||
|
table.getTableLevel() );
|
||||||
|
tableRowElement.appendChild( tableCellElement );
|
||||||
|
}
|
||||||
|
|
||||||
|
tableRowElement.appendChild( textDocumentFacade.createText( "\n" ) );
|
||||||
|
flow.appendChild( tableRowElement );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -45,7 +45,8 @@ public class TestWordToConverterSuite
|
||||||
|
|
||||||
public static Test suite()
|
public static Test suite()
|
||||||
{
|
{
|
||||||
TestSuite suite = new TestSuite(TestWordToConverterSuite.class.getName());
|
TestSuite suite = new TestSuite(
|
||||||
|
TestWordToConverterSuite.class.getName() );
|
||||||
|
|
||||||
File directory = POIDataSamples.getDocumentInstance().getFile(
|
File directory = POIDataSamples.getDocumentInstance().getFile(
|
||||||
"../document" );
|
"../document" );
|
||||||
|
@ -63,14 +64,21 @@ public class TestWordToConverterSuite
|
||||||
{
|
{
|
||||||
public void runTest() throws Exception
|
public void runTest() throws Exception
|
||||||
{
|
{
|
||||||
test( child, false );
|
testFo( child );
|
||||||
}
|
}
|
||||||
} );
|
} );
|
||||||
suite.addTest( new TestCase( name + " [HTML]" )
|
suite.addTest( new TestCase( name + " [HTML]" )
|
||||||
{
|
{
|
||||||
public void runTest() throws Exception
|
public void runTest() throws Exception
|
||||||
{
|
{
|
||||||
test( child, true );
|
testHtml( child );
|
||||||
|
}
|
||||||
|
} );
|
||||||
|
suite.addTest( new TestCase( name + " [TEXT]" )
|
||||||
|
{
|
||||||
|
public void runTest() throws Exception
|
||||||
|
{
|
||||||
|
testText( child );
|
||||||
}
|
}
|
||||||
} );
|
} );
|
||||||
|
|
||||||
|
@ -79,7 +87,7 @@ public class TestWordToConverterSuite
|
||||||
return suite;
|
return suite;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static void test( File child, boolean html ) throws Exception
|
protected static void testFo( File child ) throws Exception
|
||||||
{
|
{
|
||||||
HWPFDocumentCore hwpfDocument;
|
HWPFDocumentCore hwpfDocument;
|
||||||
try
|
try
|
||||||
|
@ -88,7 +96,6 @@ public class TestWordToConverterSuite
|
||||||
}
|
}
|
||||||
catch ( Exception exc )
|
catch ( Exception exc )
|
||||||
{
|
{
|
||||||
// unable to parse file -- not WordToFoConverter fault
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,14 +109,74 @@ public class TestWordToConverterSuite
|
||||||
Transformer transformer = TransformerFactory.newInstance()
|
Transformer transformer = TransformerFactory.newInstance()
|
||||||
.newTransformer();
|
.newTransformer();
|
||||||
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
|
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
|
||||||
transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
|
transformer.setOutputProperty( OutputKeys.INDENT, "false" );
|
||||||
transformer.transform(
|
transformer.transform(
|
||||||
new DOMSource( wordToFoConverter.getDocument() ),
|
new DOMSource( wordToFoConverter.getDocument() ),
|
||||||
new StreamResult( stringWriter ) );
|
new StreamResult( stringWriter ) );
|
||||||
|
|
||||||
if ( html )
|
// no exceptions
|
||||||
transformer.setOutputProperty( OutputKeys.METHOD, "html" );
|
}
|
||||||
|
|
||||||
|
protected static void testHtml( File child ) throws Exception
|
||||||
|
{
|
||||||
|
HWPFDocumentCore hwpfDocument;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
hwpfDocument = AbstractWordUtils.loadDoc( child );
|
||||||
|
}
|
||||||
|
catch ( Exception exc )
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
|
||||||
|
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||||
|
.newDocument() );
|
||||||
|
wordToHtmlConverter.processDocument( hwpfDocument );
|
||||||
|
|
||||||
|
StringWriter stringWriter = new StringWriter();
|
||||||
|
|
||||||
|
Transformer transformer = TransformerFactory.newInstance()
|
||||||
|
.newTransformer();
|
||||||
|
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
|
||||||
|
transformer.setOutputProperty( OutputKeys.INDENT, "false" );
|
||||||
|
transformer.setOutputProperty( OutputKeys.METHOD, "html" );
|
||||||
|
transformer.transform(
|
||||||
|
new DOMSource( wordToHtmlConverter.getDocument() ),
|
||||||
|
new StreamResult( stringWriter ) );
|
||||||
|
|
||||||
// no exceptions
|
// no exceptions
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected static void testText( File child ) throws Exception
|
||||||
|
{
|
||||||
|
HWPFDocumentCore wordDocument;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
wordDocument = AbstractWordUtils.loadDoc( child );
|
||||||
|
}
|
||||||
|
catch ( Exception exc )
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
WordToTextConverter wordToTextConverter = new WordToTextConverter(
|
||||||
|
DocumentBuilderFactory.newInstance().newDocumentBuilder()
|
||||||
|
.newDocument() );
|
||||||
|
wordToTextConverter.processDocument( wordDocument );
|
||||||
|
|
||||||
|
StringWriter stringWriter = new StringWriter();
|
||||||
|
|
||||||
|
Transformer transformer = TransformerFactory.newInstance()
|
||||||
|
.newTransformer();
|
||||||
|
transformer.setOutputProperty( OutputKeys.ENCODING, "utf-8" );
|
||||||
|
transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
|
||||||
|
transformer.setOutputProperty( OutputKeys.METHOD, "text" );
|
||||||
|
transformer.transform(
|
||||||
|
new DOMSource( wordToTextConverter.getDocument() ),
|
||||||
|
new StreamResult( stringWriter ) );
|
||||||
|
|
||||||
|
stringWriter.toString();
|
||||||
|
// no exceptions
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue