bug 50955 -- word 6.0 charset fix

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790061 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-04-04 02:06:46 +00:00
parent bd5ea8d3b7
commit 6fe3b75bfd
24 changed files with 971 additions and 64 deletions

View File

@ -218,6 +218,9 @@ public class TestAllFiles {
"document/Word6_sections2.doc",
"document/Word95.doc",
"document/word95err.doc",
"document/Bug60936.doc",
"document/Bug60942.doc",
"document/Bug60942b.doc",
"hpsf/TestMickey.doc",
"document/52117.doc"
);

View File

@ -18,6 +18,9 @@
package org.apache.poi.util;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Set;
/**
* Utilities for working with Microsoft CodePages.
@ -27,6 +30,13 @@ import java.io.UnsupportedEncodingException;
*/
public class CodePageUtil
{
public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>();
static {
//others?
VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
}
/** <p>Codepage 037, a special case</p> */
public static final int CP_037 = 37;

View File

@ -0,0 +1,107 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.util;
import java.io.ByteArrayInputStream;
/**
* Stream that converts MSOffice's way of storing Big5, with
* zero-byte padding for ASCII and in LittleEndianOrder.
*/
@Internal
public class LittleEndianBig5Stream extends ByteArrayInputStream {
    /*
     * The wrapped buffer is consumed two bytes at a time: the first byte of
     * each pair is the logically trailing ("low") byte and the second one the
     * logically leading ("high") byte. Pairs whose high byte is greater than
     * 0x80 are emitted as high byte followed by low byte; for every other
     * pair only the low byte is emitted.
     *
     * NOTE(review): only read() and read(byte[], int, int) are overridden.
     * Bulk methods that newer JDKs add to ByteArrayInputStream (for example
     * readAllBytes) presumably bypass this conversion - confirm before use.
     */

    private static final int EOF = -1;
    //marker meaning "no trailing byte is waiting to be returned"
    private static final int EMPTY_TRAILING = -3;

    //the byte that is logically trailing in Big5 encoding;
    //in little-endian order it is the first one encountered.
    int trailing = EMPTY_TRAILING;

    /**
     * @param buf the little-endian, zero-padded bytes to convert
     */
    public LittleEndianBig5Stream(byte[] buf) {
        super(buf);
    }

    /**
     * @param buf    the buffer holding the little-endian, zero-padded bytes
     * @param offset index of the first byte to read
     * @param length maximum number of bytes to read from the buffer
     */
    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
        super(buf, offset, length);
    }

    /**
     * Returns the next converted byte.
     * <p>
     * If the underlying data ends in the middle of a pair, the single
     * remaining byte is returned as-is before end of stream is reported.
     * Once -1 has been returned, every later call returns -1 as well.
     *
     * @return the next byte (0-255), or -1 at the end of the stream
     */
    @Override
    public int read() {
        //second half of a double-byte pair still waiting to be handed out
        if (trailing != EMPTY_TRAILING) {
            int pending = trailing;
            trailing = EMPTY_TRAILING;
            return pending;
        }

        int low = super.read();
        if (low == EOF) {
            return EOF;
        }
        int high = super.read();
        if (high == EOF) {
            //odd number of bytes: hand out the dangling byte now instead of
            //parking it in 'trailing', where it would surface only after
            //an EOF had already been reported
            return low;
        }

        if (high > 0x80) {
            //treated as a double-byte character: leading byte first,
            //the trailing byte follows on the next call
            trailing = low;
            return high;
        }
        //high byte 0x00 is the zero padding of a single-byte character;
        //any other high byte up to 0x80 is discarded in the same way
        return low;
    }

    /**
     * Fills part of the given array with converted bytes.
     *
     * @param buff destination array
     * @param off  index in {@code buff} of the first byte to write
     * @param len  maximum number of bytes to write
     * @return the number of bytes written, or -1 if the stream was already
     *         at its end and {@code len} is greater than zero
     * @throws NullPointerException      if {@code buff} is null
     * @throws IndexOutOfBoundsException if {@code off} or {@code len} is
     *                                   negative or the range exceeds the array
     */
    @Override
    public int read(byte[] buff, int off, int len) {
        if (buff == null) {
            throw new NullPointerException("buff");
        }
        if (off < 0 || len < 0 || len > buff.length - off) {
            throw new IndexOutOfBoundsException("off=" + off + " len=" + len
                    + " buff.length=" + buff.length);
        }
        int bytesRead = 0;
        while (bytesRead < len) {
            int b = read();
            if (b == EOF) {
                return (bytesRead == 0) ? EOF : bytesRead;
            }
            buff[off + bytesRead] = (byte) b;
            bytesRead++;
        }
        return bytesRead;
    }
}

View File

@ -17,6 +17,8 @@
package org.apache.poi.util;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
@ -27,9 +29,14 @@ import java.util.Map;
*/
@Internal
public class StringUtil {
private static final POILogger logger = POILogFactory
.getLogger(StringUtil.class);
protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
public static final Charset UTF16LE = Charset.forName("UTF-16LE");
public static final Charset UTF8 = Charset.forName("UTF-8");
public static final Charset WIN_1252 = Charset.forName("cp1252");
public static final Charset BIG5 = Charset.forName("Big5");
private static Map<Integer,Integer> msCodepointToUnicode;
@ -573,7 +580,28 @@ public class StringUtil {
9133, // 0xf0fe bracerightbt
' ', // 0xf0ff not defined
};
/**
 * Converts a region of a byte array that holds Big5 text in little-endian,
 * zero-padded form into a String.
 * <p>
 * The region is run through a {@link LittleEndianBig5Stream} and the bytes
 * that come out of it are decoded with the {@link #BIG5} charset.
 * Should the copy fail with an IOException, a warning is logged and
 * whatever was copied up to that point is decoded.
 * There may be areas for improvement in this.
 *
 * @param data          buffer that contains the encoded text
 * @param offset        index in {@code data} of the first byte to convert
 * @param lengthInBytes number of bytes to convert, starting at {@code offset}
 * @return the decoded text
 */
public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) {
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    final LittleEndianBig5Stream source = new LittleEndianBig5Stream(data, offset, lengthInBytes);
    try {
        IOUtils.copy(source, sink);
    } catch (IOException e) {
        //both ends are in-memory streams, so this is not expected to happen
        logger.log(POILogger.WARN,
                "IOException while copying a byte array stream to a byte array stream?!");
    }
    final byte[] big5Bytes = sink.toByteArray();
    return new String(big5Bytes, BIG5);
}
// Could be replaced with org.apache.commons.lang3.StringUtils#join
@Internal
public static String join(Object[] array, String separator) {

View File

@ -108,7 +108,7 @@ public class HwmfFont {
return charset;
}
static WmfCharset valueOf(int flag) {
public static WmfCharset valueOf(int flag) {
for (WmfCharset cs : values()) {
if (cs.flag == flag) return cs;
}

View File

@ -19,27 +19,43 @@ package org.apache.poi.hwpf;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.hwpf.model.ComplexFileTable;
import org.apache.poi.hwpf.model.FontTable;
import org.apache.poi.hwpf.model.OldCHPBinTable;
import org.apache.poi.hwpf.model.OldComplexFileTable;
import org.apache.poi.hwpf.model.OldFfn;
import org.apache.poi.hwpf.model.OldFontTable;
import org.apache.poi.hwpf.model.OldPAPBinTable;
import org.apache.poi.hwpf.model.OldSectionTable;
import org.apache.poi.hwpf.model.OldTextPieceTable;
import org.apache.poi.hwpf.model.PieceDescriptor;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.NotImplemented;
import org.apache.poi.util.StringUtil;
/**
* Provides very simple support for old (Word 6 / Word 95)
* files.
*/
public class HWPFOldDocument extends HWPFDocumentCore {
private TextPieceTable tpt;
private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
private OldTextPieceTable tpt;
private StringBuilder _text;
private final OldFontTable fontTable;
private final Charset guessedCharset;
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
this(fs.getRoot());
@ -56,45 +72,52 @@ public class HWPFOldDocument extends HWPFDocumentCore {
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
//int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
//int shfTableSize = LittleEndian.getInt(_mainStream, 0x64);
int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize);
//TODO: figure out how to map runs/text pieces to fonts
//for now, if there's a non standard codepage in one of the fonts
//assume that the doc is in that codepage.
guessedCharset = guessCodePage(fontTable);
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
// We need to get hold of the text that makes up the
// document, which might be regular or fast-saved
ComplexFileTable cft = null;
StringBuffer text = new StringBuffer();
if(_fib.getFibBase().isFComplex()) {
cft = new ComplexFileTable(
cft = new OldComplexFileTable(
_mainStream, _mainStream,
complexTableOffset, _fib.getFibBase().getFcMin()
complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset
);
tpt = cft.getTextPieceTable();
tpt = (OldTextPieceTable)cft.getTextPieceTable();
for(TextPiece tp : tpt.getTextPieces()) {
text.append( tp.getStringBuilder() );
}
} else {
// TODO Discover if these older documents can ever hold Unicode Strings?
// (We think not, because they seem to lack a Piece table)
// TODO Build the Piece Descriptor properly
// (We have to fake it, as they don't seem to have a proper Piece table)
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
pd.setFilePosition(_fib.getFibBase().getFcMin());
// Generate a single Text Piece Table, with a single Text Piece
// which covers all the (8 bit only) text in the file
tpt = new TextPieceTable();
tpt = new OldTextPieceTable();
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
int numChars = textData.length;
if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
numChars /= 2;
}
TextPiece tp = new TextPiece(
0, textData.length, textData, pd
0, numChars, textData, pd
);
tpt.add(tp);
text.append(tp.getStringBuilder());
}
_text = tpt.getText();
// Now we can fetch the character and paragraph properties
@ -133,12 +156,54 @@ public class HWPFOldDocument extends HWPFDocumentCore {
}
}
/**
 * Picks a charset for the document from its font table: the charset of the
 * first font whose charset identifier resolves to something other than the
 * ANSI, default or symbol charset. Falls back to {@link #DEFAULT_CHARSET}
 * when no font qualifies.
 * <p>
 * Ideally, we'd want to track fonts with runs, but we don't yet
 * know how to do that.
 * <p>
 * Consider throwing an exception if &gt; 1 unique codepage that is not
 * default, symbol or ansi appears here.
 * <p>
 * NOTE(review): the result is whatever {@code WmfCharset.getCharset()}
 * returns for the chosen font; that looks like it can be {@code null} for
 * some identifiers - confirm.
 *
 * @param fontTable the font table whose fonts are inspected, in table order
 * @return the guessed charset
 */
private Charset guessCodePage(OldFontTable fontTable) {
    for (OldFfn font : fontTable.getFontNames()) {
        //the identifier is stored in a signed byte; widen it without sign extension
        int charsetId = font.getChs() & 0xff;
        HwmfFont.WmfCharset candidate = HwmfFont.WmfCharset.valueOf(charsetId);
        if (candidate == null) {
            continue;
        }
        boolean uninformative = candidate == HwmfFont.WmfCharset.ANSI_CHARSET
                || candidate == HwmfFont.WmfCharset.DEFAULT_CHARSET
                || candidate == HwmfFont.WmfCharset.SYMBOL_CHARSET;
        if (!uninformative) {
            return candidate.getCharset();
        }
    }
    return DEFAULT_CHARSET;
}
public Range getOverallRange()
{
// Life is easy when we have no footers, headers or unicode!
return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this );
}
/**
 * Not available for old documents; use {@link #getOldFontTable()} instead!!!
 * This always throws an UnsupportedOperationException.
 *
 * @return nothing
 * @throws UnsupportedOperationException always
 */
@Override
@NotImplemented
public FontTable getFontTable() {
    throw new UnsupportedOperationException("Use getOldFontTable instead.");
}
/**
 * @return the font table that was read from the main stream when this
 *         document was constructed
 */
public OldFontTable getOldFontTable() {
    return fontTable;
}
public Range getRange()
{
return getOverallRange();
@ -167,4 +232,19 @@ public class HWPFOldDocument extends HWPFDocumentCore {
public void write(OutputStream out) throws IOException {
throw new IllegalStateException("Writing is not available for the older file formats");
}
/**
* As a rough heuristic (total hack), read through the font table
* and take the first non-default, non-ansi, non-symbol
* font's charset and return that.
*
* Once we figure out how to link a font to a text piece, we should
* use the font information per text piece.
*
* @return charset
*/
public Charset getGuessedCharset() {
return guessedCharset;
}
}

View File

@ -18,6 +18,7 @@
package org.apache.poi.hwpf.model;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
@ -26,9 +27,10 @@ import org.apache.poi.hwpf.model.io.HWPFOutputStream;
import org.apache.poi.hwpf.sprm.SprmBuffer;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
@Internal
public final class ComplexFileTable {
public class ComplexFileTable {
private static final byte GRPPRL_TYPE = 1;
private static final byte TEXT_PIECE_TABLE_TYPE = 2;
@ -40,7 +42,8 @@ public final class ComplexFileTable {
_tpt = new TextPieceTable();
}
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException {
protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin,
Charset charset) throws IOException {
//skips through the prms before we reach the piece table. These contain data
//for actual fast saved files
List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
@ -61,7 +64,12 @@ public final class ComplexFileTable {
}
int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
offset += LittleEndian.INT_SIZE;
_tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin);
_tpt = newTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset);
}
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException {
this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252);
}
public TextPieceTable getTextPieceTable() {
@ -92,4 +100,11 @@ public final class ComplexFileTable {
tableStream.write(table);
}
/**
 * Factory hook through which the constructor creates its text piece table.
 * Subclasses can override it to supply a different table implementation.
 * <p>
 * This implementation ignores {@code charset}.
 *
 * @param documentStream the document stream handed to the table
 * @param tableStream    the table stream handed to the table
 * @param offset         offset handed to the table
 * @param pieceTableSize size handed to the table
 * @param fcMin          fcMin handed to the table
 * @param charset        not used by this implementation
 * @return a new {@link TextPieceTable}
 */
protected TextPieceTable newTextPieceTable(byte[] documentStream,
        byte[] tableStream, int offset, int pieceTableSize, int fcMin,
        Charset charset) {
    final TextPieceTable table =
            new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin);
    return table;
}
}

View File

@ -44,7 +44,7 @@ public final class OldCHPBinTable extends CHPBinTable
* @param fcMin
*/
public OldCHPBinTable(byte[] documentStream, int offset,
int size, int fcMin, TextPieceTable tpt)
int size, int fcMin, OldTextPieceTable tpt)
{
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);

View File

@ -0,0 +1,42 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import java.io.IOException;
import java.nio.charset.Charset;
import org.apache.poi.util.Internal;
/**
 * {@link ComplexFileTable} whose text piece table is an
 * {@link OldTextPieceTable} built with a caller-supplied charset.
 */
@Internal
public final class OldComplexFileTable extends ComplexFileTable {

    /**
     * Passes all arguments straight on to the superclass constructor.
     *
     * @param documentStream the document stream
     * @param tableStream    the table stream
     * @param offset         offset passed to the superclass
     * @param fcMin          fcMin passed to the superclass
     * @param charset        charset passed to the superclass
     * @throws IOException if the superclass constructor throws it
     */
    public OldComplexFileTable(byte[] documentStream, byte[] tableStream,
            int offset, int fcMin, Charset charset) throws IOException {
        super(documentStream, tableStream, offset, fcMin, charset);
    }

    /**
     * Creates an {@link OldTextPieceTable} from the given arguments,
     * including the charset.
     */
    @Override
    protected TextPieceTable newTextPieceTable(byte[] documentStream,
            byte[] tableStream, int offset,
            int pieceTableSize, int fcMin, Charset charset) {
        final OldTextPieceTable oldTable = new OldTextPieceTable(
                documentStream, tableStream, offset, pieceTableSize, fcMin, charset);
        return oldTable;
    }
}

View File

@ -0,0 +1,161 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import java.nio.charset.Charset;
import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.apache.poi.util.StringUtil;
/**
 * Word 6.0 Font information
 * <p>
 * Holds the charset identifier byte of a font record, its main font name,
 * an optional alternate font name and the number of bytes that were
 * calculated for the record.
 */
@Internal
public final class OldFfn {

    private static final POILogger logger = POILogFactory.getLogger(OldFfn.class);

    //number of bytes in front of the zero-terminated main font name
    private static final int PRELIMINARY_BYTES =
            LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE;

    private final byte _chs;// character set identifier

    private final String fontName;
    private final String altFontName;

    private final int length; //length in bytes for this record

    /**
     * try to read an OldFfn starting at offset; read no farther than end
     * <p>
     * As read here, a record consists of one length byte, three bytes that
     * are skipped, the charset identifier byte, one more skipped byte, the
     * zero-terminated main font name and, if the length byte says there is
     * more data, a zero-terminated alternate font name. Names are decoded
     * with the charset belonging to the charset identifier, or with
     * windows-1252 if none can be determined.
     *
     * @param buf          buffer from which to read
     * @param offset       offset at which to start
     * @param fontTableEnd read no farther than this
     * @return an OldFfn or null if asked to read beyond end
     */
    static OldFfn build(byte[] buf, int offset, int fontTableEnd) {
        //fontTableEnd comes from the file; never let it point past the buffer
        final int end = Math.min(fontTableEnd, buf.length);
        final int start = offset;
        //preliminary bytes
        if (offset < 0 || offset + PRELIMINARY_BYTES > end) {
            return null;
        }
        //first byte; masked because a sign-extended value above 127 would be
        //negative and would slip through the bounds check below
        int fontDescriptionLength = buf[offset] & 0xff;
        offset += 1;
        if (offset + fontDescriptionLength > end) {
            logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font");
            return null;
        }

        //no idea what these 3 bytes do
        offset += 3;
        byte chs = buf[offset];
        Charset charset = null;
        HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff);
        if (wmfCharset == null) {
            logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff));
        } else {
            charset = wmfCharset.getCharset();
        }
        charset = charset == null ? StringUtil.WIN_1252 : charset;
        offset += LittleEndian.BYTE_SIZE;
        //if this byte here == 7, it _may_ signify existence of
        //an altername font name

        //not sure what the byte after the _chs does
        offset += LittleEndian.BYTE_SIZE;

        int fontNameEnd = indexOfZero(buf, offset, end);
        if (fontNameEnd == -1) {
            logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length");
            return null;
        }
        int fontNameLength = fontNameEnd - offset;
        String fontName = new String(buf, offset, fontNameLength, charset);

        String altFontName = null;
        int altFontNameLength = -1;
        offset += fontNameLength + 1;
        if (offset - start < fontDescriptionLength) {
            //the alternate name may end on the last byte covered by the length byte
            int altFontNameEnd = indexOfZero(buf, offset, start + fontDescriptionLength + 1);
            if (altFontNameEnd > -1) {
                altFontNameLength = altFontNameEnd - offset;
                altFontName = new String(buf, offset, altFontNameLength, charset);
            }
        }
        //reset to 0 for length calculation
        altFontNameLength = (altFontNameLength < 0) ? 0 : altFontNameLength + 1;//add one for zero byte

        int len = PRELIMINARY_BYTES +//6 starting bytes
                fontNameLength + altFontNameLength + 1;//+1 is for the zero byte
        //this len should == fontDescriptionLength

        return new OldFfn(chs, fontName, altFontName, len);
    }

    /**
     * Finds the first zero byte in {@code buf[from..toExclusive)}, never
     * looking past the end of the buffer.
     *
     * @return index of the zero byte, or -1 if there is none in the range
     */
    private static int indexOfZero(byte[] buf, int from, int toExclusive) {
        int limit = Math.min(toExclusive, buf.length);
        for (int i = from; i < limit; i++) {
            if (buf[i] == 0) {
                return i;
            }
        }
        return -1;
    }

    /**
     * @param charsetIdentifier character set identifier byte of the font
     * @param fontName          main font name
     * @param altFontName       alternate font name, may be null
     * @param length            length in bytes for this record
     */
    public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) {
        this._chs = charsetIdentifier;
        this.fontName = fontName;
        this.altFontName = altFontName;
        this.length = length;
    }

    /**
     * @return the character set identifier as a signed byte; mask with 0xff
     *         to get the unsigned value
     */
    public byte getChs() {
        return _chs;
    }

    /**
     * @return the main font name
     */
    public String getMainFontName() {
        return fontName;
    }

    /**
     * @return altFontName if it exists, null otherwise
     */
    public String getAltFontName() {
        return altFontName;
    }

    /**
     * @return length in bytes for this record
     */
    public int getLength() {
        return length;
    }

    @Override
    public String toString() {
        return "OldFfn{" +
                "_chs=" + (_chs & 0xff) +
                ", fontName='" + fontName + '\'' +
                ", altFontName='" + altFontName + '\'' +
                ", length=" + length +
                '}';
    }
}

View File

@ -0,0 +1,84 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
 * Font table for Word 6.0
 * <p>
 * Reads consecutive {@link OldFfn} records from a region of a byte array
 * and keeps them in the order in which they were read.
 */
@Internal
public final class OldFontTable {
    private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class);

    // added extra facilitator members
    // FFN structure containing strings of font names
    private final OldFfn[] _fontNames;

    /**
     * Reads font records until {@link OldFfn#build(byte[], int, int)}
     * returns null. The first short of the region is skipped (it should
     * equal {@code length}). An offset or length that is negative or lies
     * outside of {@code buf} results in an empty table and a warning.
     *
     * @param buf    buffer that contains the font table
     * @param offset index in {@code buf} at which the font table starts
     * @param length length of the font table in bytes
     */
    public OldFontTable(byte[] buf, int offset, int length) {
        List<OldFfn> ffns = new ArrayList<OldFfn>();
        //long arithmetic so that values taken from a corrupt file cannot overflow
        long firstRecord = (long) offset + LittleEndian.SHORT_SIZE;
        if (offset < 0 || length < 0 || firstRecord > buf.length) {
            _logger.log(POILogger.WARN, "Font table lies outside of the stream; no fonts read");
        } else {
            //never read past the buffer, whatever the declared length says
            int endOfTableOffset = (int) Math.min((long) offset + length, (long) buf.length);
            int startOffset = (int) firstRecord;//first short should == length!

            while (true) {
                OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset);
                if (oldFfn == null) {
                    break;
                }
                ffns.add(oldFfn);
                startOffset += oldFfn.getLength();
            }
        }
        _fontNames = ffns.toArray(new OldFfn[ffns.size()]);
    }

    /**
     * @return the fonts of this table; this is the internal array, not a copy
     */
    public OldFfn[] getFontNames() {
        return _fontNames;
    }

    /**
     * Looks up the main font name of the font at the given index.
     *
     * @param chpFtc index into the font table
     * @return the main font name, or null (after logging) if the index is
     *         not a valid index into this table
     */
    public String getMainFont(int chpFtc) {
        if (chpFtc < 0 || chpFtc >= _fontNames.length) {
            _logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount");
            return null;
        }

        return _fontNames[chpFtc].getMainFontName();
    }

    @Override
    public String toString() {
        return "OldFontTable{" +
                "_fontNames=" + Arrays.toString(_fontNames) +
                '}';
    }
}

View File

@ -0,0 +1,120 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import org.apache.poi.util.Internal;
import org.apache.poi.util.NotImplemented;
/**
 * Text piece of an old document that additionally keeps the raw bytes it
 * was created from.
 * Positions are expressed in characters, not bytes. Several operations of
 * the superclass are not supported here and throw
 * UnsupportedOperationException.
 */
@Internal
public class OldTextPiece extends TextPiece {

    //the byte array handed to the constructor (kept by reference, not copied)
    private final byte[] rawBytes;

    /**
     * @param start Beginning offset in main document stream, in characters.
     * @param end   Ending offset in main document stream, in characters.
     * @param text  The raw bytes of our text
     * @param pd    The piece descriptor, passed on to the superclass
     * @throws IllegalStateException if end is smaller than start
     */
    public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
        super(start, end, text, pd);
        this.rawBytes = text;
        if (start > end) {
            throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
        }
    }

    /**
     * @return nothing, ever. Always throws an UnsupportedOperationException
     * @throws UnsupportedOperationException always
     */
    @NotImplemented
    @Override
    public boolean isUnicode() {
        throw new UnsupportedOperationException();
    }

    /**
     * @return the buffer held by the superclass, cast to a StringBuilder
     */
    public StringBuilder getStringBuilder() {
        final Object buffer = _buf;
        return (StringBuilder) buffer;
    }

    /**
     * @return a copy of the raw bytes this piece was created with
     */
    @Override
    public byte[] getRawBytes() {
        return rawBytes.clone();
    }

    /**
     * Not implemented for OldTextPiece.
     * Always throws UnsupportedOperationException
     *
     * @param start Local start position, in characters
     * @param end   Local end position, in characters
     * @throws UnsupportedOperationException always
     */
    @Deprecated
    @NotImplemented
    public String substring(int start, int end) {
        throw new UnsupportedOperationException();
    }

    /**
     * Not implemented for OldTextPiece.
     * Always throws UnsupportedOperationException
     */
    @Deprecated
    @NotImplemented
    public void adjustForDelete(int start, int length) {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns the length, in bytes
     */
    public int bytesLength() {
        return rawBytes.length;
    }

    /**
     * Not meant to be used: trips an assertion when assertions are enabled
     * and otherwise returns a constant.
     */
    @Override
    public int hashCode() {
        assert false : "hashCode not designed";
        return 42; // any arbitrary constant will do
    }

    /**
     * Returns the character position we start at.
     */
    public int getCP() {
        return getStart();
    }

    public String toString() {
        StringBuilder description = new StringBuilder("OldTextPiece from ");
        description.append(getStart());
        description.append(" to ");
        description.append(getEnd());
        description.append(" (");
        description.append(getPieceDescriptor());
        description.append(")");
        return description.toString();
    }
}

View File

@ -0,0 +1,119 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf.model;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.Internal;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
/**
 * Text piece table whose pieces are {@link OldTextPiece}s and whose piece
 * descriptors are created with a caller-supplied charset.
 * A character counts as two bytes if the piece is Unicode or the charset is
 * one of {@link CodePageUtil#VARIABLE_BYTE_CHARSETS}, and as one byte
 * otherwise.
 */
@Internal
public class OldTextPieceTable extends TextPieceTable {

    private static final POILogger logger = POILogFactory
            .getLogger(OldTextPieceTable.class);

    /**
     * Creates a table through the no-argument superclass constructor.
     */
    public OldTextPieceTable() {
        super();
    }

    /**
     * Builds the table from the piece descriptors stored in the table stream
     * and the text bytes stored in the document stream.
     *
     * @param documentStream stream the text bytes are copied from
     * @param tableStream    stream that holds the plex of piece descriptors
     * @param offset         offset of the plex in {@code tableStream}
     * @param size           size of the plex
     * @param fcMin          subtracted from file positions to find the earliest piece
     * @param charset        charset handed to every piece descriptor, may be null
     */
    public OldTextPieceTable(byte[] documentStream, byte[] tableStream,
            int offset, int size, int fcMin, Charset charset) {
        // get our plex of PieceDescriptors
        final PlexOfCps plex = new PlexOfCps(tableStream, offset, size,
                PieceDescriptor.getSizeInBytes());
        final int count = plex.length();

        // turn the raw bytes of every node into a PieceDescriptor
        final PieceDescriptor[] descriptors = new PieceDescriptor[count];
        for (int i = 0; i < count; i++) {
            descriptors[i] = new PieceDescriptor(plex.getProperty(i).getBytes(), 0, charset);
        }

        // Figure out the cp of the earliest text piece
        // Note that text pieces don't have to be stored in order!
        int earliest = descriptors[0].getFilePosition() - fcMin;
        for (PieceDescriptor descriptor : descriptors) {
            earliest = Math.min(earliest, descriptor.getFilePosition() - fcMin);
        }
        _cpMin = earliest;

        // the charset is the same for all pieces, so test it only once
        final boolean variableByteCharset =
                charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset);

        // using the PieceDescriptors, build our list of TextPieces.
        for (int i = 0; i < descriptors.length; i++) {
            final PieceDescriptor descriptor = descriptors[i];
            final GenericPropertyNode node = plex.getProperty(i);

            // start and end of the node are in characters
            final int startChars = node.getStart();
            final int endChars = node.getEnd();

            // What's the relationship between bytes and characters?
            final int bytesPerChar = (descriptor.isUnicode() || variableByteCharset) ? 2 : 1;
            final int byteCount = (endChars - startChars) * bytesPerChar;

            // Grab the data that makes up the piece
            final byte[] pieceBytes = new byte[byteCount];
            System.arraycopy(documentStream, descriptor.getFilePosition(), pieceBytes, 0, byteCount);

            // And now build the piece
            _textPieces.add(newTextPiece(startChars, endChars, pieceBytes, descriptor));
        }

        // In the interest of our sanity, now sort the text pieces
        // into order, if they're not already
        Collections.sort(_textPieces);
        _textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces);
        Collections.sort(_textPiecesFCOrder, new FCComparator());
    }

    /**
     * @return a new {@link OldTextPiece} built from the given arguments
     */
    @Override
    protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
        final OldTextPiece piece = new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd);
        return piece;
    }

    /**
     * @return 2 if the charset of the piece's descriptor is one of
     *         {@link CodePageUtil#VARIABLE_BYTE_CHARSETS}, 1 otherwise
     */
    @Override
    protected int getEncodingMultiplier(TextPiece textPiece) {
        final Charset pieceCharset = textPiece.getPieceDescriptor().getCharset();
        final boolean variableByte = pieceCharset != null
                && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(pieceCharset);
        return variableByte ? 2 : 1;
    }
}

View File

@ -260,7 +260,7 @@ public class PAPBinTable
SprmBuffer sprmBuffer = null;
for ( PAPX papx : papxs )
{
if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 )
continue;
if ( sprmBuffer == null ) {

View File

@ -17,10 +17,13 @@
package org.apache.poi.hwpf.model;
import java.nio.charset.Charset;
import org.apache.poi.util.BitField;
import org.apache.poi.util.BitFieldFactory;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
@Internal
public final class PieceDescriptor
@ -32,29 +35,51 @@ public final class PieceDescriptor
private static BitField fCopied = BitFieldFactory.getInstance(0x04);
int fc;
PropertyModifier prm;
boolean unicode;
boolean unicode = false;
private final Charset charset;
public PieceDescriptor(byte[] buf, int offset)
{
descriptor = LittleEndian.getShort(buf, offset);
offset += LittleEndian.SHORT_SIZE;
fc = LittleEndian.getInt(buf, offset);
offset += LittleEndian.INT_SIZE;
prm = new PropertyModifier( LittleEndian.getShort(buf, offset));
// see if this piece uses unicode.
if ((fc & 0x40000000) == 0)
{
unicode = true;
}
else
{
unicode = false;
fc &= ~(0x40000000);//gives me FC in doc stream
fc /= 2;
public PieceDescriptor(byte[] buf, int offset) {
this(buf, offset, null);
}
/**
 * Reads a piece descriptor from its raw bytes: a short descriptor value,
 * an int file position and a short property modifier.
 * <p>
 * This initializer should only be used for HWPFOldDocuments.
 * <p>
 * If {@code charset} is null, the second most significant bit of the file
 * position decides whether the piece is Unicode (see the comment in the
 * body). If a charset is given, the file position is used exactly as
 * stored, and the piece counts as Unicode only when the charset equals
 * UTF-16LE.
 *
 * @param buf buffer that contains the raw descriptor
 * @param offset index in {@code buf} at which the descriptor starts
 * @param charset which charset to use if this is not unicode;
 *                null to derive it from the file position
 */
public PieceDescriptor(byte[] buf, int offset, Charset charset) {
    descriptor = LittleEndian.getShort(buf, offset);
    offset += LittleEndian.SHORT_SIZE;
    fc = LittleEndian.getInt(buf, offset);
    offset += LittleEndian.INT_SIZE;
    prm = new PropertyModifier(LittleEndian.getShort(buf, offset));
    if (charset == null) {
        // see if this piece uses unicode.
        //From the documentation: If the second most significant bit
        //is clear, then this indicates the actual file offset of the Unicode character (two bytes). If the
        //second most significant bit is set, then the actual address of the codepage-1252
        //compressed version of the Unicode character (one byte), is actually at the offset indicated
        //by clearing this bit and dividing by two.
        if ((fc & 0x40000000) == 0) {
            unicode = true;
            this.charset = null;
        } else {
            unicode = false;
            fc &= ~(0x40000000);//gives me FC in doc stream
            fc /= 2;
            this.charset = StringUtil.WIN_1252;
        }
    } else {
        //compare by value: an equal Charset need not be the very same instance
        if (StringUtil.UTF16LE.equals(charset)) {
            unicode = true;
        }
        this.charset = charset;
    }
}
public int getFilePosition()
@ -72,6 +97,15 @@ public final class PieceDescriptor
return unicode;
}
/**
 * Returns the charset stored on this descriptor.
 *
 * @return charset to use if this is not a Unicode PieceDescriptor;
 * this can be <code>null</code>
 */
public Charset getCharset() {
return charset;
}
public PropertyModifier getPrm()
{
return prm;

View File

@ -21,6 +21,7 @@ package org.apache.poi.hwpf.model;
import java.nio.charset.Charset;
import org.apache.poi.util.Internal;
import org.apache.poi.util.StringUtil;
/**
* Lightweight representation of a text piece.
@ -40,7 +41,6 @@ public class TextPiece extends PropertyNode<TextPiece> {
* @param start Beginning offset in main document stream, in characters.
* @param end Ending offset in main document stream, in characters.
* @param text The raw bytes of our text
* @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)}
* instead
*/
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
@ -72,8 +72,13 @@ public class TextPiece extends PropertyNode<TextPiece> {
* Create the StringBuilder from the text and unicode flag
*/
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
    // Big5 pieces are decoded by StringUtil.littleEndianBig5Stream instead
    // of a direct charset decode of the raw bytes.
    if (StringUtil.BIG5.equals(pd.getCharset())) {
        return new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length));
    }
    // Unicode pieces are UTF-16LE; every other piece is decoded with the
    // charset carried by its descriptor rather than a hard-coded Cp1252.
    Charset charset = pd.isUnicode() ? StringUtil.UTF16LE : pd.getCharset();
    return new StringBuilder(new String(text, charset));
}
@ -207,4 +212,5 @@ public class TextPiece extends PropertyNode<TextPiece> {
return "TextPiece from " + getStart() + " to " + getEnd() + " ("
+ getPieceDescriptor() + ")";
}
}

View File

@ -101,7 +101,7 @@ public class TextPieceTable implements CharIndexTranslator {
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
// And now build the piece
final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf,
final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf,
pieces[x]);
_textPieces.add(newTextPiece);
@ -114,6 +114,10 @@ public class TextPieceTable implements CharIndexTranslator {
Collections.sort(_textPiecesFCOrder, new FCComparator());
}
/**
 * Creates the {@link TextPiece} for one entry of the piece table. Being
 * protected and non-final, it can be overridden by subclasses.
 *
 * @param nodeStartChars value passed as the piece's start
 * @param nodeEndChars   value passed as the piece's end
 * @param buf            the raw bytes of the piece's text
 * @param pd             the descriptor belonging to this piece
 * @return a new TextPiece built from the given arguments
 */
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
return new TextPiece(nodeStartChars, nodeEndChars, buf, pd);
}
public void add(TextPiece piece) {
_textPieces.add(piece);
_textPiecesFCOrder.add(piece);
@ -249,7 +253,7 @@ public class TextPieceTable implements CharIndexTranslator {
if (rangeStartBytes > rangeEndBytes)
continue;
final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
final int encodingMultiplier = getEncodingMultiplier(textPiece);
final int rangeStartCp = textPiece.getStart()
+ (rangeStartBytes - tpStart) / encodingMultiplier;
@ -262,6 +266,10 @@ public class TextPieceTable implements CharIndexTranslator {
return result.toArray(new int[result.size()][]);
}
/**
 * Gives the factor by which byte distances inside a piece are divided to
 * obtain character distances.
 *
 * @param textPiece the piece whose encoding is inspected
 * @return 2 when the piece reports itself as Unicode, otherwise 1
 */
protected int getEncodingMultiplier(TextPiece textPiece) {
    if (textPiece.isUnicode()) {
        return 2;
    }
    return 1;
}
/**
 * Returns the value of the {@code _cpMin} field.
 * NOTE(review): presumably the lowest character position covered by this
 * table -- confirm against where {@code _cpMin} is assigned.
 */
public int getCpMin() {
return _cpMin;
}
@ -439,7 +447,7 @@ public class TextPieceTable implements CharIndexTranslator {
return textPlex.toByteArray();
}
private static class FCComparator implements Comparator<TextPiece>, Serializable {
protected static class FCComparator implements Comparator<TextPiece>, Serializable {
public int compare(TextPiece textPiece, TextPiece textPiece1) {
if (textPiece.getPieceDescriptor().fc > textPiece1
.getPieceDescriptor().fc) {

View File

@ -18,6 +18,7 @@
package org.apache.poi.hwpf.usermodel;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.model.CHPX;
import org.apache.poi.hwpf.model.FFData;
import org.apache.poi.hwpf.model.Ffn;
@ -438,6 +439,10 @@ public final class CharacterRun extends Range
public String getFontName()
{
if (_doc instanceof HWPFOldDocument) {
return ((HWPFOldDocument) _doc).getOldFontTable().getMainFont(_props.getFtcAscii());
}
if (_doc.getFontTable() == null)
// old word format
return null;

View File

@ -16,18 +16,19 @@
==================================================================== */
package org.apache.poi.hwpf.converter;
import java.io.File;
import java.io.FilenameFilter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.junit.Assert.assertNotNull;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.FilenameFilter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.poi.POIDataSamples;
import org.apache.poi.hwpf.HWPFDocumentCore;
@ -36,8 +37,6 @@ import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import static org.junit.Assert.assertNotNull;
@RunWith(Parameterized.class)
public class TestWordToConverterSuite
{
@ -45,7 +44,11 @@ public class TestWordToConverterSuite
* YK: a quick hack to exclude failing documents from the suite.
*/
private static List<String> failingFiles = Arrays
.asList( "ProblemExtracting.doc" );
.asList( "ProblemExtracting.doc",
"Bug50955.doc" //basic extraction works,
// but these extractors modify the document,
// which is a no-go for this Word 6.0 file
);
@Parameterized.Parameters(name="{index}: {0}")
public static Iterable<Object[]> files() {

View File

@ -57,6 +57,7 @@ import junit.framework.TestCase;
* against HWPF
*/
public class TestBugs{
private static final POILogger logger = POILogFactory.getLogger(TestBugs.class);
public static void assertEqualsIgnoreNewline(String expected, String actual )
@ -536,13 +537,6 @@ public class TestBugs{
hwpfDocument.getPicturesTable().getAllPictures();
}
/**
 * [FAILING] Bug 50955 - error while retrieving the text file.
 * <p>
 * Passes only when {@code getTextOldFile("Bug50955.doc")} throws an
 * {@link IllegalStateException}, as declared by the {@code expected}
 * attribute of the annotation.
 */
@Test(expected=IllegalStateException.class)
public void test50955() throws IOException {
getTextOldFile("Bug50955.doc");
}
/**
* [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta

View File

@ -17,14 +17,19 @@
package org.apache.poi.hwpf.usermodel;
import static org.apache.poi.POITestCase.assertContains;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.nio.charset.Charset;
import org.apache.poi.OldFileFormatException;
import org.apache.poi.hwmf.record.HwmfFont;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.HWPFTestCase;
import org.apache.poi.hwpf.HWPFTestDataSamples;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.model.OldFontTable;
import org.junit.Test;
/**
@ -98,7 +103,7 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
// Normal, superscript for 4th, normal
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
doc.close();
}
@ -143,4 +148,87 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
doc.getRange().getParagraph(1).text());
doc.close();
}
/**
 * Checks that the text extracted from Bug60942.doc contains the expected
 * strings.
 */
@Test
public void testDefaultCodePageEncoding() throws IOException {
    HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
    try {
        Word6Extractor ex = new Word6Extractor(doc);
        String txt = ex.getText();
        assertContains(txt, "BERTHOD");
        assertContains(txt, "APPLICOLOR");
        assertContains(txt, "les meilleurs");
        assertContains(txt, "GUY LECOLE");
    } finally {
        // release the document even when an assertion fails
        doc.close();
    }
}
/**
 * Checks that the paragraph text extracted from Bug50955.doc contains the
 * expected Cyrillic word.
 */
@Test
public void testCodePageBug50955() throws IOException {
    //windows 1251
    HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
    try {
        Word6Extractor ex = new Word6Extractor(doc);
        StringBuilder sb = new StringBuilder();
        for (String p : ex.getParagraphText()) {
            sb.append(p);
        }
        assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings!
    } finally {
        // release the document even when an assertion fails
        doc.close();
    }
}
/**
 * Checks that the paragraph text extracted from Bug60936.doc contains the
 * expected Czech phrase, including its accented characters.
 */
@Test
public void testCodePageBug60936() throws IOException {
    //windows 1250 -- this test file was generated with OpenOffice
    //see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration
    HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc");
    try {
        Word6Extractor ex = new Word6Extractor(doc);
        StringBuilder sb = new StringBuilder();
        for (String p : ex.getParagraphText()) {
            sb.append(p);
        }
        //Czech, roughly "4 score and 7 years ago"
        assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");
    } finally {
        // release the document even when an assertion fails
        doc.close();
    }
}
/**
 * Checks the number of fonts and the main font names read from the old font
 * table of Bug51944.doc, the first of which is a non-ASCII name.
 */
@Test
public void testOldFontTableEncoding() throws IOException {
    HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
    try {
        OldFontTable oldFontTable = doc.getOldFontTable();
        assertEquals(5, oldFontTable.getFontNames().length);
        assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName());
        // expected value first, so a failure message reads the right way round
        assertEquals(Charset.forName("Big5"), HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset());
        assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName());
    } finally {
        // release the document even when an assertion fails
        doc.close();
    }
}
/**
 * Checks that main and alternate font names are both read from the old font
 * table of Bug60942b.doc.
 */
@Test
public void testOldFontTableAltName() throws IOException {
    HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc");
    try {
        OldFontTable oldFontTable = doc.getOldFontTable();
        assertEquals(5, oldFontTable.getFontNames().length);
        assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName());
        assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName());
        assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName());
        assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName());
    } finally {
        // release the document even when an assertion fails
        doc.close();
    }
}
/**
 * Checks that selected phrases are present in the paragraph text extracted
 * from Bug51944.doc, after collapsing runs of CR/LF into a single newline.
 */
@Test
public void test51944() throws IOException {
    HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
    try {
        Word6Extractor ex = new Word6Extractor(doc);
        StringBuilder sb = new StringBuilder();
        for (String p : ex.getParagraphText()) {
            sb.append(p.replaceAll("[\r\n]+", "\n"));
        }
        String txt = sb.toString();
        assertContains(txt, "Post and Fax");
        assertContains(txt, "also maintain");//this is at a critical juncture
        assertContains(txt, "which are available for");//this too

        //TODO: figure out why these two aren't passing
        // assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
        // assertContains(txt, "We are able to");//not sure if we can get this easily?
    } finally {
        // release the document even when an assertion fails
        doc.close();
    }
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.