mirror of https://github.com/apache/poi.git
Basic text extraction support for old Word 6 and Word 95 documents via some HWPF extensions
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@959346 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7ae1a20f07
commit
999aecbaa1
|
@ -17,26 +17,43 @@
|
|||
|
||||
package org.apache.poi.hwpf;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PushbackInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.poi.EncryptedDocumentException;
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.hwpf.model.CHPBinTable;
|
||||
import org.apache.poi.hwpf.model.CPSplitCalculator;
|
||||
import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||
import org.apache.poi.hwpf.model.DocumentProperties;
|
||||
import org.apache.poi.hwpf.model.EscherRecordHolder;
|
||||
import org.apache.poi.hwpf.model.FSPATable;
|
||||
import org.apache.poi.hwpf.model.FileInformationBlock;
|
||||
import org.apache.poi.hwpf.model.FontTable;
|
||||
import org.apache.poi.hwpf.model.GenericPropertyNode;
|
||||
import org.apache.poi.hwpf.model.ListTables;
|
||||
import org.apache.poi.hwpf.model.PAPBinTable;
|
||||
import org.apache.poi.hwpf.model.PicturesTable;
|
||||
import org.apache.poi.hwpf.model.PlexOfCps;
|
||||
import org.apache.poi.hwpf.model.PropertyNode;
|
||||
import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
|
||||
import org.apache.poi.hwpf.model.SavedByTable;
|
||||
import org.apache.poi.hwpf.model.SectionTable;
|
||||
import org.apache.poi.hwpf.model.ShapesTable;
|
||||
import org.apache.poi.hwpf.model.StyleSheet;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||
import org.apache.poi.hwpf.model.io.HWPFFileSystem;
|
||||
import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
||||
import org.apache.poi.hwpf.usermodel.HWPFList;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
|
||||
import org.apache.poi.hwpf.model.*;
|
||||
import org.apache.poi.hwpf.model.io.*;
|
||||
import org.apache.poi.hwpf.usermodel.*;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -46,17 +63,11 @@ import org.apache.poi.hwpf.usermodel.*;
|
|||
*
|
||||
* @author Ryan Ackley
|
||||
*/
|
||||
public final class HWPFDocument extends POIDocument
|
||||
// implements Cloneable
|
||||
public final class HWPFDocument extends HWPFDocumentCore
|
||||
{
|
||||
/** The FIB */
|
||||
protected FileInformationBlock _fib;
|
||||
/** And for making sense of CP lengths in the FIB */
|
||||
protected CPSplitCalculator _cpSplit;
|
||||
|
||||
/** main document stream buffer*/
|
||||
protected byte[] _mainStream;
|
||||
|
||||
/** table stream buffer*/
|
||||
protected byte[] _tableStream;
|
||||
|
||||
|
@ -110,29 +121,7 @@ public final class HWPFDocument extends POIDocument
|
|||
|
||||
protected HWPFDocument()
|
||||
{
|
||||
super(null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Takens an InputStream, verifies that it's not RTF, builds a
|
||||
* POIFSFileSystem from it, and returns that.
|
||||
*/
|
||||
public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
|
||||
// Open a PushbackInputStream, so we can peek at the first few bytes
|
||||
PushbackInputStream pis = new PushbackInputStream(istream,6);
|
||||
byte[] first6 = new byte[6];
|
||||
pis.read(first6);
|
||||
|
||||
// Does it start with {\rtf ? If so, it's really RTF
|
||||
if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
|
||||
&& first6[3] == 't' && first6[4] == 'f') {
|
||||
throw new IllegalArgumentException("The document is really a RTF file");
|
||||
}
|
||||
|
||||
// OK, so it's not RTF
|
||||
// Open a POIFSFileSystem on the (pushed back) stream
|
||||
pis.unread(first6);
|
||||
return new POIFSFileSystem(pis);
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -171,21 +160,16 @@ public final class HWPFDocument extends POIDocument
|
|||
*/
|
||||
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
|
||||
{
|
||||
// Sort out the hpsf properties
|
||||
// Load the main stream and FIB
|
||||
// Also handles HPSF bits
|
||||
super(directory, pfilesystem);
|
||||
|
||||
// read in the main stream.
|
||||
DocumentEntry documentProps = (DocumentEntry)
|
||||
directory.getEntry("WordDocument");
|
||||
_mainStream = new byte[documentProps.getSize()];
|
||||
|
||||
directory.createDocumentInputStream("WordDocument").read(_mainStream);
|
||||
|
||||
// Create our FIB, and check for the doc being encrypted
|
||||
_fib = new FileInformationBlock(_mainStream);
|
||||
// Do the CP Split
|
||||
_cpSplit = new CPSplitCalculator(_fib);
|
||||
if(_fib.isFEncrypted()) {
|
||||
throw new EncryptedDocumentException("Cannot process encrypted word files!");
|
||||
|
||||
// Is this document too old for us?
|
||||
if(_fib.getNFib() < 106) {
|
||||
throw new OldWordFileFormatException("The document is too old (Word 95 or older) ");
|
||||
}
|
||||
|
||||
// use the fib to determine the name of the table stream.
|
||||
|
@ -691,17 +675,4 @@ public final class HWPFDocument extends POIDocument
|
|||
t.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
// public Object clone()
|
||||
// throws CloneNotSupportedException
|
||||
// {
|
||||
// _tpt;
|
||||
//
|
||||
// _cbt;
|
||||
//
|
||||
// _pbt;
|
||||
//
|
||||
// _st;
|
||||
//
|
||||
// }
|
||||
}
|
||||
|
|
|
@ -0,0 +1,130 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.PushbackInputStream;
|
||||
|
||||
import org.apache.poi.EncryptedDocumentException;
|
||||
import org.apache.poi.POIDocument;
|
||||
import org.apache.poi.hwpf.model.FileInformationBlock;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.DocumentEntry;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
|
||||
/**
|
||||
* This class holds much of the core of a Word document, but
|
||||
* without some of the table structure information.
|
||||
* You generally want to work with one of
|
||||
* {@link HWPFDocument} or {@link HWPFOldDocument}
|
||||
*/
|
||||
public abstract class HWPFDocumentCore extends POIDocument
|
||||
{
|
||||
/** The FIB */
|
||||
protected FileInformationBlock _fib;
|
||||
|
||||
/** main document stream buffer*/
|
||||
protected byte[] _mainStream;
|
||||
|
||||
protected HWPFDocumentCore()
|
||||
{
|
||||
super(null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Takens an InputStream, verifies that it's not RTF, builds a
|
||||
* POIFSFileSystem from it, and returns that.
|
||||
*/
|
||||
public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
|
||||
// Open a PushbackInputStream, so we can peek at the first few bytes
|
||||
PushbackInputStream pis = new PushbackInputStream(istream,6);
|
||||
byte[] first6 = new byte[6];
|
||||
pis.read(first6);
|
||||
|
||||
// Does it start with {\rtf ? If so, it's really RTF
|
||||
if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
|
||||
&& first6[3] == 't' && first6[4] == 'f') {
|
||||
throw new IllegalArgumentException("The document is really a RTF file");
|
||||
}
|
||||
|
||||
// OK, so it's not RTF
|
||||
// Open a POIFSFileSystem on the (pushed back) stream
|
||||
pis.unread(first6);
|
||||
return new POIFSFileSystem(pis);
|
||||
}
|
||||
|
||||
/**
|
||||
* This constructor loads a Word document from an InputStream.
|
||||
*
|
||||
* @param istream The InputStream that contains the Word document.
|
||||
* @throws IOException If there is an unexpected IOException from the passed
|
||||
* in InputStream.
|
||||
*/
|
||||
public HWPFDocumentCore(InputStream istream) throws IOException
|
||||
{
|
||||
//do Ole stuff
|
||||
this( verifyAndBuildPOIFS(istream) );
|
||||
}
|
||||
|
||||
/**
|
||||
* This constructor loads a Word document from a POIFSFileSystem
|
||||
*
|
||||
* @param pfilesystem The POIFSFileSystem that contains the Word document.
|
||||
* @throws IOException If there is an unexpected IOException from the passed
|
||||
* in POIFSFileSystem.
|
||||
*/
|
||||
public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
|
||||
{
|
||||
this(pfilesystem.getRoot(), pfilesystem);
|
||||
}
|
||||
|
||||
/**
|
||||
* This constructor loads a Word document from a specific point
|
||||
* in a POIFSFileSystem, probably not the default.
|
||||
* Used typically to open embeded documents.
|
||||
*
|
||||
* @param pfilesystem The POIFSFileSystem that contains the Word document.
|
||||
* @throws IOException If there is an unexpected IOException from the passed
|
||||
* in POIFSFileSystem.
|
||||
*/
|
||||
public HWPFDocumentCore(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
|
||||
{
|
||||
// Sort out the hpsf properties
|
||||
super(directory, pfilesystem);
|
||||
|
||||
// read in the main stream.
|
||||
DocumentEntry documentProps = (DocumentEntry)
|
||||
directory.getEntry("WordDocument");
|
||||
_mainStream = new byte[documentProps.getSize()];
|
||||
|
||||
directory.createDocumentInputStream("WordDocument").read(_mainStream);
|
||||
|
||||
// Create our FIB, and check for the doc being encrypted
|
||||
_fib = new FileInformationBlock(_mainStream);
|
||||
if(_fib.isFEncrypted()) {
|
||||
throw new EncryptedDocumentException("Cannot process encrypted word files!");
|
||||
}
|
||||
}
|
||||
|
||||
public FileInformationBlock getFileInformationBlock()
|
||||
{
|
||||
return _fib;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,135 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hwpf;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.hwpf.model.CHPX;
|
||||
import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
||||
import org.apache.poi.hwpf.model.PieceDescriptor;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
/**
|
||||
* Provides very simple support for old (Word 6 / Word 95)
|
||||
* files.
|
||||
* TODO Provide a way to get at the properties associated
|
||||
* with each block of text
|
||||
*/
|
||||
public class HWPFOldDocument extends HWPFDocumentCore {
|
||||
private List<TextAndCHPX> contents = new ArrayList<TextAndCHPX>();
|
||||
|
||||
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
||||
this(fs.getRoot(), fs);
|
||||
}
|
||||
|
||||
public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
|
||||
throws IOException {
|
||||
super(directory, fs);
|
||||
|
||||
// Where are things?
|
||||
int chpTableOffset = LittleEndian.getInt(_mainStream, 0xb8);
|
||||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
||||
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
||||
|
||||
// We need to get hold of the text that makes up the
|
||||
// document, which might be regular or fast-saved
|
||||
StringBuffer text = new StringBuffer();
|
||||
TextPieceTable tpt;
|
||||
if(_fib.isFComplex()) {
|
||||
ComplexFileTable cft = new ComplexFileTable(
|
||||
_mainStream, _mainStream,
|
||||
complexTableOffset, _fib.getFcMin()
|
||||
);
|
||||
tpt = cft.getTextPieceTable();
|
||||
|
||||
for(TextPiece tp : tpt.getTextPieces()) {
|
||||
text.append( tp.getStringBuffer() );
|
||||
}
|
||||
} else {
|
||||
// TODO Build the Piece Descriptor properly
|
||||
// TODO Can these old documents ever contain Unicode strings?
|
||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
|
||||
pd.setFilePosition(_fib.getFcMin());
|
||||
|
||||
tpt = new TextPieceTable();
|
||||
byte[] textData = new byte[_fib.getFcMac()-_fib.getFcMin()];
|
||||
System.arraycopy(_mainStream, _fib.getFcMin(), textData, 0, textData.length);
|
||||
TextPiece tp = new TextPiece(
|
||||
0, textData.length, textData, pd, 0
|
||||
);
|
||||
tpt.getTextPieces().add(tp);
|
||||
|
||||
text.append(tp.getStringBuffer());
|
||||
}
|
||||
|
||||
// Now we can fetch the character and paragraph properties
|
||||
OldCHPBinTable chpTable = new OldCHPBinTable(
|
||||
_mainStream, chpTableOffset, chpTableSize,
|
||||
_fib.getFcMin(), tpt
|
||||
);
|
||||
|
||||
// Finally build up runs
|
||||
for(CHPX chpx : chpTable.getTextRuns()) {
|
||||
String str = text.substring(chpx.getStart(), chpx.getEnd());
|
||||
contents.add(new TextAndCHPX(str,chpx));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(OutputStream out) throws IOException {
|
||||
throw new IllegalStateException("Writing is not available for the older file formats");
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves all our text, in order, along with the
|
||||
* CHPX information on each bit.
|
||||
* Every entry has the same formatting, but as yet
|
||||
* we've no way to tell what the formatting is...
|
||||
* Warnings - this will change as soon as we support
|
||||
* text formatting!
|
||||
*/
|
||||
public List<TextAndCHPX> getContents() {
|
||||
return contents;
|
||||
}
|
||||
|
||||
/**
|
||||
* Warnings - this will change as soon as we support
|
||||
* text formatting!
|
||||
*/
|
||||
public static class TextAndCHPX {
|
||||
private String text;
|
||||
private CHPX chpx;
|
||||
private TextAndCHPX(String text, CHPX chpx) {
|
||||
this.text = text;
|
||||
this.chpx = chpx;
|
||||
}
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
public CHPX getChpx() {
|
||||
return chpx;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hwpf;
|
||||
|
||||
import org.apache.poi.OldFileFormatException;
|
||||
|
||||
public class OldWordFileFormatException extends OldFileFormatException {
|
||||
public OldWordFileFormatException(String s) {
|
||||
super(s);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.extractor;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.poi.POIOLE2TextExtractor;
|
||||
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||
import org.apache.poi.hwpf.HWPFOldDocument.TextAndCHPX;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
|
||||
/**
|
||||
* Class to extract the text from old (Word 6 / Word 95) Word Documents.
|
||||
*
|
||||
* This should only be used on the older files, for most uses you
|
||||
* should call {@link WordExtractor} which deals properly
|
||||
* with HWPF.
|
||||
*
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public final class Word6Extractor extends POIOLE2TextExtractor {
|
||||
private POIFSFileSystem fs;
|
||||
private HWPFOldDocument doc;
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param is InputStream containing the word file
|
||||
*/
|
||||
public Word6Extractor(InputStream is) throws IOException {
|
||||
this( new POIFSFileSystem(is) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param fs POIFSFileSystem containing the word file
|
||||
*/
|
||||
public Word6Extractor(POIFSFileSystem fs) throws IOException {
|
||||
this(fs.getRoot(), fs);
|
||||
}
|
||||
public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
|
||||
this(new HWPFOldDocument(dir,fs));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new Word Extractor
|
||||
* @param doc The HWPFOldDocument to extract from
|
||||
*/
|
||||
public Word6Extractor(HWPFOldDocument doc) {
|
||||
super(doc);
|
||||
this.doc = doc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
StringBuffer text = new StringBuffer();
|
||||
for(TextAndCHPX tchpx : doc.getContents()) {
|
||||
text.append( Range.stripFields(tchpx.getText()) );
|
||||
}
|
||||
return text.toString();
|
||||
}
|
||||
}
|
|
@ -40,7 +40,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
|||
* You should use either getParagraphText() or getText() unless
|
||||
* you have a strong reason otherwise.
|
||||
*
|
||||
* @author Nick Burch (nick at torchbox dot com)
|
||||
* @author Nick Burch
|
||||
*/
|
||||
public final class WordExtractor extends POIOLE2TextExtractor {
|
||||
private POIFSFileSystem fs;
|
||||
|
|
|
@ -61,4 +61,9 @@ public final class CHPX extends BytePropertyNode
|
|||
CharacterProperties props = CharacterSprmUncompressor.uncompressCHP(baseStyle, getGrpprl(), 0);
|
||||
return props;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "CHPX from " + getStart() + " to " + getEnd() +
|
||||
" (in bytes " + getStartBytes() + " to " + getEndBytes() + ")";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.poifs.common.POIFSConstants;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
|
||||
/**
|
||||
* This class holds all of the character formatting
|
||||
* properties from Old (Word 6 / Word 95) documents.
|
||||
* Unlike with Word 97+, it all gets held in the
|
||||
* same stream.
|
||||
* In common with the rest of the old support, it
|
||||
* is read only
|
||||
*/
|
||||
public final class OldCHPBinTable
|
||||
{
|
||||
/** List of character properties.*/
|
||||
protected ArrayList<CHPX> _textRuns = new ArrayList<CHPX>();
|
||||
|
||||
/**
|
||||
* Constructor used to read an old-style binTable
|
||||
* in from a Word document.
|
||||
*
|
||||
* @param documentStream
|
||||
* @param offset
|
||||
* @param size
|
||||
* @param fcMin
|
||||
*/
|
||||
public OldCHPBinTable(byte[] documentStream, int offset,
|
||||
int size, int fcMin, TextPieceTable tpt)
|
||||
{
|
||||
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
|
||||
|
||||
int length = binTable.length();
|
||||
for (int x = 0; x < length; x++)
|
||||
{
|
||||
GenericPropertyNode node = binTable.getProperty(x);
|
||||
|
||||
int pageNum = LittleEndian.getShort(node.getBytes());
|
||||
int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;
|
||||
|
||||
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
|
||||
pageOffset, fcMin, tpt);
|
||||
|
||||
int fkpSize = cfkp.size();
|
||||
|
||||
for (int y = 0; y < fkpSize; y++)
|
||||
{
|
||||
_textRuns.add(cfkp.getCHPX(y));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<CHPX> getTextRuns()
|
||||
{
|
||||
return _textRuns;
|
||||
}
|
||||
}
|
|
@ -19,13 +19,12 @@ package org.apache.poi.hwpf.extractor;
|
|||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||
import org.apache.poi.hwpf.OldWordFileFormatException;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.POIDataSamples;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
|
||||
/**
|
||||
* Test the different routes to extracting text
|
||||
|
@ -237,4 +236,42 @@ public final class TestWordExtractor extends TestCase {
|
|||
|
||||
assertTrue(b.toString().contains("TestComment"));
|
||||
}
|
||||
|
||||
public void testWord95() throws Exception {
|
||||
// Too old for the default
|
||||
try {
|
||||
extractor = new WordExtractor(
|
||||
POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
|
||||
);
|
||||
fail();
|
||||
} catch(OldWordFileFormatException e) {}
|
||||
|
||||
// Can work with the special one
|
||||
Word6Extractor w6e = new Word6Extractor(
|
||||
POIDataSamples.getDocumentInstance().openResourceAsStream("Word95.doc")
|
||||
);
|
||||
String text = w6e.getText();
|
||||
|
||||
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
|
||||
assertTrue(text.contains("Paragraph 2"));
|
||||
assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
|
||||
assertTrue(text.contains("Last (4th) paragraph"));
|
||||
}
|
||||
|
||||
public void testWord6() throws Exception {
|
||||
// Too old for the default
|
||||
try {
|
||||
extractor = new WordExtractor(
|
||||
POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
|
||||
);
|
||||
fail();
|
||||
} catch(OldWordFileFormatException e) {}
|
||||
|
||||
Word6Extractor w6e = new Word6Extractor(
|
||||
POIDataSamples.getDocumentInstance().openResourceAsStream("Word6.doc")
|
||||
);
|
||||
String text = w6e.getText();
|
||||
|
||||
assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue