mirror of https://github.com/apache/poi.git
Basic text extraction support for old Word 6 and Word 95 documents via some HWPF extensions
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@959346 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7ae1a20f07
commit
999aecbaa1
|
@ -17,26 +17,43 @@
|
||||||
|
|
||||||
package org.apache.poi.hwpf;
|
package org.apache.poi.hwpf;
|
||||||
|
|
||||||
import java.io.InputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.PushbackInputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
import org.apache.poi.EncryptedDocumentException;
|
import org.apache.poi.hwpf.model.CHPBinTable;
|
||||||
import org.apache.poi.POIDocument;
|
import org.apache.poi.hwpf.model.CPSplitCalculator;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.hwpf.model.DocumentProperties;
|
||||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
import org.apache.poi.hwpf.model.EscherRecordHolder;
|
||||||
|
import org.apache.poi.hwpf.model.FSPATable;
|
||||||
|
import org.apache.poi.hwpf.model.FileInformationBlock;
|
||||||
|
import org.apache.poi.hwpf.model.FontTable;
|
||||||
|
import org.apache.poi.hwpf.model.GenericPropertyNode;
|
||||||
|
import org.apache.poi.hwpf.model.ListTables;
|
||||||
|
import org.apache.poi.hwpf.model.PAPBinTable;
|
||||||
|
import org.apache.poi.hwpf.model.PicturesTable;
|
||||||
|
import org.apache.poi.hwpf.model.PlexOfCps;
|
||||||
|
import org.apache.poi.hwpf.model.PropertyNode;
|
||||||
|
import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
|
||||||
|
import org.apache.poi.hwpf.model.SavedByTable;
|
||||||
|
import org.apache.poi.hwpf.model.SectionTable;
|
||||||
|
import org.apache.poi.hwpf.model.ShapesTable;
|
||||||
|
import org.apache.poi.hwpf.model.StyleSheet;
|
||||||
|
import org.apache.poi.hwpf.model.TextPiece;
|
||||||
|
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||||
|
import org.apache.poi.hwpf.model.io.HWPFFileSystem;
|
||||||
|
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
||||||
|
import org.apache.poi.hwpf.usermodel.HWPFList;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
import org.apache.poi.poifs.common.POIFSConstants;
|
import org.apache.poi.poifs.common.POIFSConstants;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.hwpf.model.*;
|
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||||
import org.apache.poi.hwpf.model.io.*;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
import org.apache.poi.hwpf.usermodel.*;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -46,17 +63,11 @@ import org.apache.poi.hwpf.usermodel.*;
|
||||||
*
|
*
|
||||||
* @author Ryan Ackley
|
* @author Ryan Ackley
|
||||||
*/
|
*/
|
||||||
public final class HWPFDocument extends POIDocument
|
public final class HWPFDocument extends HWPFDocumentCore
|
||||||
// implements Cloneable
|
|
||||||
{
|
{
|
||||||
/** The FIB */
|
|
||||||
protected FileInformationBlock _fib;
|
|
||||||
/** And for making sense of CP lengths in the FIB */
|
/** And for making sense of CP lengths in the FIB */
|
||||||
protected CPSplitCalculator _cpSplit;
|
protected CPSplitCalculator _cpSplit;
|
||||||
|
|
||||||
/** main document stream buffer*/
|
|
||||||
protected byte[] _mainStream;
|
|
||||||
|
|
||||||
/** table stream buffer*/
|
/** table stream buffer*/
|
||||||
protected byte[] _tableStream;
|
protected byte[] _tableStream;
|
||||||
|
|
||||||
|
@ -110,29 +121,7 @@ public final class HWPFDocument extends POIDocument
|
||||||
|
|
||||||
protected HWPFDocument()
|
protected HWPFDocument()
|
||||||
{
|
{
|
||||||
super(null, null);
|
super();
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Takens an InputStream, verifies that it's not RTF, builds a
|
|
||||||
* POIFSFileSystem from it, and returns that.
|
|
||||||
*/
|
|
||||||
public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
|
|
||||||
// Open a PushbackInputStream, so we can peek at the first few bytes
|
|
||||||
PushbackInputStream pis = new PushbackInputStream(istream,6);
|
|
||||||
byte[] first6 = new byte[6];
|
|
||||||
pis.read(first6);
|
|
||||||
|
|
||||||
// Does it start with {\rtf ? If so, it's really RTF
|
|
||||||
if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
|
|
||||||
&& first6[3] == 't' && first6[4] == 'f') {
|
|
||||||
throw new IllegalArgumentException("The document is really a RTF file");
|
|
||||||
}
|
|
||||||
|
|
||||||
// OK, so it's not RTF
|
|
||||||
// Open a POIFSFileSystem on the (pushed back) stream
|
|
||||||
pis.unread(first6);
|
|
||||||
return new POIFSFileSystem(pis);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -171,21 +160,16 @@ public final class HWPFDocument extends POIDocument
|
||||||
*/
|
*/
|
||||||
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
|
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
|
||||||
{
|
{
|
||||||
// Sort out the hpsf properties
|
// Load the main stream and FIB
|
||||||
|
// Also handles HPSF bits
|
||||||
super(directory, pfilesystem);
|
super(directory, pfilesystem);
|
||||||
|
|
||||||
// read in the main stream.
|
// Do the CP Split
|
||||||
DocumentEntry documentProps = (DocumentEntry)
|
|
||||||
directory.getEntry("WordDocument");
|
|
||||||
_mainStream = new byte[documentProps.getSize()];
|
|
||||||
|
|
||||||
directory.createDocumentInputStream("WordDocument").read(_mainStream);
|
|
||||||
|
|
||||||
// Create our FIB, and check for the doc being encrypted
|
|
||||||
_fib = new FileInformationBlock(_mainStream);
|
|
||||||
_cpSplit = new CPSplitCalculator(_fib);
|
_cpSplit = new CPSplitCalculator(_fib);
|
||||||
if(_fib.isFEncrypted()) {
|
|
||||||
throw new EncryptedDocumentException("Cannot process encrypted word files!");
|
// Is this document too old for us?
|
||||||
|
if(_fib.getNFib() < 106) {
|
||||||
|
throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
|
||||||
}
|
}
|
||||||
|
|
||||||
// use the fib to determine the name of the table stream.
|
// use the fib to determine the name of the table stream.
|
||||||
|
@ -691,17 +675,4 @@ public final class HWPFDocument extends POIDocument
|
||||||
t.printStackTrace();
|
t.printStackTrace();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// public Object clone()
|
|
||||||
// throws CloneNotSupportedException
|
|
||||||
// {
|
|
||||||
// _tpt;
|
|
||||||
//
|
|
||||||
// _cbt;
|
|
||||||
//
|
|
||||||
// _pbt;
|
|
||||||
//
|
|
||||||
// _st;
|
|
||||||
//
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,130 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.PushbackInputStream;
|
||||||
|
|
||||||
|
import org.apache.poi.EncryptedDocumentException;
|
||||||
|
import org.apache.poi.POIDocument;
|
||||||
|
import org.apache.poi.hwpf.model.FileInformationBlock;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
|
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class holds much of the core of a Word document, but
|
||||||
|
* without some of the table structure information.
|
||||||
|
* You generally want to work with one of
|
||||||
|
* {@link HWPFDocument} or {@link HWPFOldDocument}
|
||||||
|
*/
|
||||||
|
public abstract class HWPFDocumentCore extends POIDocument
|
||||||
|
{
|
||||||
|
/** The FIB */
|
||||||
|
protected FileInformationBlock _fib;
|
||||||
|
|
||||||
|
/** main document stream buffer*/
|
||||||
|
protected byte[] _mainStream;
|
||||||
|
|
||||||
|
protected HWPFDocumentCore()
|
||||||
|
{
|
||||||
|
super(null, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takens an InputStream, verifies that it's not RTF, builds a
|
||||||
|
* POIFSFileSystem from it, and returns that.
|
||||||
|
*/
|
||||||
|
public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
|
||||||
|
// Open a PushbackInputStream, so we can peek at the first few bytes
|
||||||
|
PushbackInputStream pis = new PushbackInputStream(istream,6);
|
||||||
|
byte[] first6 = new byte[6];
|
||||||
|
pis.read(first6);
|
||||||
|
|
||||||
|
// Does it start with {\rtf ? If so, it's really RTF
|
||||||
|
if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
|
||||||
|
&& first6[3] == 't' && first6[4] == 'f') {
|
||||||
|
throw new IllegalArgumentException("The document is really a RTF file");
|
||||||
|
}
|
||||||
|
|
||||||
|
// OK, so it's not RTF
|
||||||
|
// Open a POIFSFileSystem on the (pushed back) stream
|
||||||
|
pis.unread(first6);
|
||||||
|
return new POIFSFileSystem(pis);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This constructor loads a Word document from an InputStream.
|
||||||
|
*
|
||||||
|
* @param istream The InputStream that contains the Word document.
|
||||||
|
* @throws IOException If there is an unexpected IOException from the passed
|
||||||
|
* in InputStream.
|
||||||
|
*/
|
||||||
|
public HWPFDocumentCore(InputStream istream) throws IOException
|
||||||
|
{
|
||||||
|
//do Ole stuff
|
||||||
|
this( verifyAndBuildPOIFS(istream) );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This constructor loads a Word document from a POIFSFileSystem
|
||||||
|
*
|
||||||
|
* @param pfilesystem The POIFSFileSystem that contains the Word document.
|
||||||
|
* @throws IOException If there is an unexpected IOException from the passed
|
||||||
|
* in POIFSFileSystem.
|
||||||
|
*/
|
||||||
|
public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
|
||||||
|
{
|
||||||
|
this(pfilesystem.getRoot(), pfilesystem);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This constructor loads a Word document from a specific point
|
||||||
|
* in a POIFSFileSystem, probably not the default.
|
||||||
|
* Used typically to open embeded documents.
|
||||||
|
*
|
||||||
|
* @param pfilesystem The POIFSFileSystem that contains the Word document.
|
||||||
|
* @throws IOException If there is an unexpected IOException from the passed
|
||||||
|
* in POIFSFileSystem.
|
||||||
|
*/
|
||||||
|
public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
|
||||||
|
{
|
||||||
|
// Sort out the hpsf properties
|
||||||
|
super(directory, pfilesystem);
|
||||||
|
|
||||||
|
// read in the main stream.
|
||||||
|
DocumentEntry documentProps = (DocumentEntry)
|
||||||
|
directory.getEntry("WordDocument");
|
||||||
|
_mainStream = new byte[documentProps.getSize()];
|
||||||
|
|
||||||
|
directory.createDocumentInputStream("WordDocument").read(_mainStream);
|
||||||
|
|
||||||
|
// Create our FIB, and check for the doc being encrypted
|
||||||
|
_fib = new FileInformationBlock(_mainStream);
|
||||||
|
if(_fib.isFEncrypted()) {
|
||||||
|
throw new EncryptedDocumentException("Cannot process encrypted word files!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public FileInformationBlock getFileInformationBlock()
|
||||||
|
{
|
||||||
|
return _fib;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,135 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hwpf;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.poi.hwpf.model.CHPX;
|
||||||
|
import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||||
|
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
||||||
|
import org.apache.poi.hwpf.model.PieceDescriptor;
|
||||||
|
import org.apache.poi.hwpf.model.TextPiece;
|
||||||
|
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides very simple support for old (Word 6 / Word 95)
|
||||||
|
* files.
|
||||||
|
* TODO Provide a way to get at the properties associated
|
||||||
|
* with each block of text
|
||||||
|
*/
|
||||||
|
public class HWPFOldDocument extends HWPFDocumentCore {
|
||||||
|
private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
|
||||||
|
|
||||||
|
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
||||||
|
this(fs.getRoot(), fs);
|
||||||
|
}
|
||||||
|
|
||||||
|
public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
|
||||||
|
throws IOException {
|
||||||
|
super(directory, fs);
|
||||||
|
|
||||||
|
// Where are things?
|
||||||
|
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
|
||||||
|
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
||||||
|
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
||||||
|
|
||||||
|
// We need to get hold of the text that makes up the
|
||||||
|
// document, which might be regular or fast-saved
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
TextPieceTable tpt;
|
||||||
|
if(_fib.isFComplex()) {
|
||||||
|
ComplexFileTable cft = new ComplexFileTable(
|
||||||
|
_mainStream, _mainStream,
|
||||||
|
complexTableOffset, _fib.getFcMin()
|
||||||
|
);
|
||||||
|
tpt = cft.getTextPieceTable();
|
||||||
|
|
||||||
|
for(TextPiece tp : tpt.getTextPieces()) {
|
||||||
|
text.append( tp.getStringBuffer() );
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// TODO Build the Piece Descriptor properly
|
||||||
|
// TODO Can these old documents ever contain Unicode strings?
|
||||||
|
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
|
||||||
|
pd.setFilePosition(_fib.getFcMin());
|
||||||
|
|
||||||
|
tpt = new TextPieceTable();
|
||||||
|
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
|
||||||
|
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
|
||||||
|
TextPiece tp = new TextPiece(
|
||||||
|
0, textData.length, textData, pd, 0
|
||||||
|
);
|
||||||
|
tpt.getTextPieces().add(tp);
|
||||||
|
|
||||||
|
text.append(tp.getStringBuffer());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we can fetch the character and paragraph properties
|
||||||
|
OldCHPBinTable chpTable = new OldCHPBinTable(
|
||||||
|
_mainStream, chpTableOffset, chpTableSize,
|
||||||
|
_fib.getFcMin(), tpt
|
||||||
|
);
|
||||||
|
|
||||||
|
// Finally build up runs
|
||||||
|
for(CHPX chpx : chpTable.getTextRuns()) {
|
||||||
|
String str = text.substring(chpx.getStart(), chpx.getEnd());
|
||||||
|
contents.add(new TextAndCHPX(str,chpx));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(OutputStream out) throws IOException {
|
||||||
|
throw new IllegalStateException("Writing is not available for the older file formats");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves all our text, in order, along with the
|
||||||
|
* CHPX information on each bit.
|
||||||
|
* Every entry has the same formatting, but as yet
|
||||||
|
* we've no way to tell what the formatting is...
|
||||||
|
* Warnings - this will change as soon as we support
|
||||||
|
* text formatting!
|
||||||
|
*/
|
||||||
|
public List<TextAndCHPX> getContents() {
|
||||||
|
return contents;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Warnings - this will change as soon as we support
|
||||||
|
* text formatting!
|
||||||
|
*/
|
||||||
|
public static class TextAndCHPX {
|
||||||
|
private String text;
|
||||||
|
private CHPX chpx;
|
||||||
|
private TextAndCHPX(String text, CHPX chpx) {
|
||||||
|
this.text = text;
|
||||||
|
this.chpx = chpx;
|
||||||
|
}
|
||||||
|
public String getText() {
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
public CHPX getChpx() {
|
||||||
|
return chpx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
package org.apache.poi.hwpf;
|
||||||
|
|
||||||
|
import org.apache.poi.OldFileFormatException;
|
||||||
|
|
||||||
|
public class OldWordFileFormatException extends OldFileFormatException {
|
||||||
|
public OldWordFileFormatException(String s) {
|
||||||
|
super(s);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,79 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
|
||||||
|
import org.apache.poi.POIOLE2TextExtractor;
|
||||||
|
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||||
|
import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
|
||||||
|
import org.apache.poi.hwpf.usermodel.Range;
|
||||||
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class to extract the text from old (Word 6 / Word 95) Word Documents.
|
||||||
|
*
|
||||||
|
* This should only be used on the older files, for most uses you
|
||||||
|
* should call {@link WordExtractor} which deals properly
|
||||||
|
* with HWPF.
|
||||||
|
*
|
||||||
|
* @author Nick Burch
|
||||||
|
*/
|
||||||
|
public final class Word6Extractor extends POIOLE2TextExtractor {
|
||||||
|
private POIFSFileSystem fs;
|
||||||
|
private HWPFOldDocument doc;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new Word Extractor
|
||||||
|
* @param is InputStream containing the word file
|
||||||
|
*/
|
||||||
|
public Word6Extractor(InputStream is) throws IOException {
|
||||||
|
this( new POIFSFileSystem(is) );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new Word Extractor
|
||||||
|
* @param fs POIFSFileSystem containing the word file
|
||||||
|
*/
|
||||||
|
public Word6Extractor(POIFSFileSystem fs) throws IOException {
|
||||||
|
this(fs.getRoot(), fs);
|
||||||
|
}
|
||||||
|
public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
|
||||||
|
this(new HWPFOldDocument(dir,fs));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new Word Extractor
|
||||||
|
* @param doc The HWPFOldDocument to extract from
|
||||||
|
*/
|
||||||
|
public Word6Extractor(HWPFOldDocument doc) {
|
||||||
|
super(doc);
|
||||||
|
this.doc = doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getText() {
|
||||||
|
StringBuffer text = new StringBuffer();
|
||||||
|
for(TextAndCHPX tchpx : doc.getContents()) {
|
||||||
|
text.append( Range.stripFields(tchpx.getText()) );
|
||||||
|
}
|
||||||
|
return text.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -40,7 +40,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
* You should use either getParagraphText() or getText() unless
|
* You should use either getParagraphText() or getText() unless
|
||||||
* you have a strong reason otherwise.
|
* you have a strong reason otherwise.
|
||||||
*
|
*
|
||||||
* @author Nick Burch (nick at torchbox dot com)
|
* @author Nick Burch
|
||||||
*/
|
*/
|
||||||
public final class WordExtractor extends POIOLE2TextExtractor {
|
public final class WordExtractor extends POIOLE2TextExtractor {
|
||||||
private POIFSFileSystem fs;
|
private POIFSFileSystem fs;
|
||||||
|
|
|
@ -61,4 +61,9 @@ public final class CHPX extends BytePropertyNode
|
||||||
CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0);
|
CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0);
|
||||||
return props;
|
return props;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "CHPX from " + getStart() + " to " + getEnd() +
|
||||||
|
" (in bytes " + getStartBytes() + " to " + getEndBytes() + ")";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,77 @@
|
||||||
|
/* ====================================================================
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==================================================================== */
|
||||||
|
|
||||||
|
package org.apache.poi.hwpf.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.poi.poifs.common.POIFSConstants;
|
||||||
|
import org.apache.poi.util.LittleEndian;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class holds all of the character formatting
|
||||||
|
* properties from Old (Word 6 / Word 95) documents.
|
||||||
|
* Unlike with Word 97+, it all gets held in the
|
||||||
|
* same stream.
|
||||||
|
* In common with the rest of the old support, it
|
||||||
|
* is read only
|
||||||
|
*/
|
||||||
|
public final class OldCHPBinTable
|
||||||
|
{
|
||||||
|
/** List of character properties.*/
|
||||||
|
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor used to read an old-style binTable
|
||||||
|
* in from a Word document.
|
||||||
|
*
|
||||||
|
* @param documentStream
|
||||||
|
* @param offset
|
||||||
|
* @param size
|
||||||
|
* @param fcMin
|
||||||
|
*/
|
||||||
|
public OldCHPBinTable(byte[] documentStream, int offset,
|
||||||
|
int size, int fcMin, TextPieceTable tpt)
|
||||||
|
{
|
||||||
|
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
|
||||||
|
|
||||||
|
int length = binTable.length();
|
||||||
|
for (int x = 0; x < length; x++)
|
||||||
|
{
|
||||||
|
GenericPropertyNode node = binTable.getProperty(x);
|
||||||
|
|
||||||
|
int pageNum = LittleEndian.getShort(node.getBytes());
|
||||||
|
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
|
||||||
|
|
||||||
|
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
|
||||||
|
pageOffset, fcMin, tpt);
|
||||||
|
|
||||||
|
int fkpSize = cfkp.size();
|
||||||
|
|
||||||
|
for (int y = 0; y < fkpSize; y++)
|
||||||
|
{
|
||||||
|
_textRuns.add(cfkp.getCHPX(y));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<CHPX> getTextRuns()
|
||||||
|
{
|
||||||
|
return _textRuns;
|
||||||
|
}
|
||||||
|
}
|
|
@ -19,13 +19,12 @@ package org.apache.poi.hwpf.extractor;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.poi.POIDataSamples;
|
||||||
import org.apache.poi.hwpf.HWPFDocument;
|
import org.apache.poi.hwpf.HWPFDocument;
|
||||||
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||||
|
import org.apache.poi.hwpf.OldWordFileFormatException;
|
||||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||||
import org.apache.poi.POIDataSamples;
|
|
||||||
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the different routes to extracting text
|
* Test the different routes to extracting text
|
||||||
|
@ -237,4 +236,42 @@ public final class TestWordExtractor extends TestCase {
|
||||||
|
|
||||||
assertTrue(b.toString().contains("TestComment"));
|
assertTrue(b.toString().contains("TestComment"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testWord95() throws Exception {
|
||||||
|
// Too old for the default
|
||||||
|
try {
|
||||||
|
extractor = new WordExtractor(
|
||||||
|
POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
|
||||||
|
);
|
||||||
|
fail();
|
||||||
|
} catch(OldWordFileFormatException e) {}
|
||||||
|
|
||||||
|
// Can work with the special one
|
||||||
|
Word6Extractor w6e = new Word6Extractor(
|
||||||
|
POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
|
||||||
|
);
|
||||||
|
String text = w6e.getText();
|
||||||
|
|
||||||
|
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
|
||||||
|
assertTrue(text.contains("Paragraph 2"));
|
||||||
|
assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
|
||||||
|
assertTrue(text.contains("Last (4th) paragraph"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWord6() throws Exception {
|
||||||
|
// Too old for the default
|
||||||
|
try {
|
||||||
|
extractor = new WordExtractor(
|
||||||
|
POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
|
||||||
|
);
|
||||||
|
fail();
|
||||||
|
} catch(OldWordFileFormatException e) {}
|
||||||
|
|
||||||
|
Word6Extractor w6e = new Word6Extractor(
|
||||||
|
POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
|
||||||
|
);
|
||||||
|
String text = w6e.getText();
|
||||||
|
|
||||||
|
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue