mirror of https://github.com/apache/poi.git
bug 50955 -- word 6.0 charset fix
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790061 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bd5ea8d3b7
commit
6fe3b75bfd
|
@ -218,6 +218,9 @@ public class TestAllFiles {
|
|||
"document/Word6_sections2.doc",
|
||||
"document/Word95.doc",
|
||||
"document/word95err.doc",
|
||||
"document/Bug60936.doc",
|
||||
"document/Bug60942.doc",
|
||||
"document/Bug60942b.doc",
|
||||
"hpsf/TestMickey.doc",
|
||||
"document/52117.doc"
|
||||
);
|
||||
|
|
|
@ -18,6 +18,9 @@
|
|||
package org.apache.poi.util;
|
||||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Utilities for working with Microsoft CodePages.
|
||||
|
@ -27,6 +30,13 @@ import java.io.UnsupportedEncodingException;
|
|||
*/
|
||||
public class CodePageUtil
|
||||
{
|
||||
|
||||
public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>();
|
||||
static {
|
||||
//others?
|
||||
VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
|
||||
}
|
||||
|
||||
/** <p>Codepage 037, a special case</p> */
|
||||
public static final int CP_037 = 37;
|
||||
|
||||
|
|
|
@ -0,0 +1,107 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.util;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
||||
/**
|
||||
* Stream that converts MSOffice's way of storing Big5, with
|
||||
* zero-byte padding for ASCII and in LittleEndianOrder.
|
||||
*/
|
||||
@Internal
|
||||
public class LittleEndianBig5Stream extends ByteArrayInputStream {
    private static final int EOF = -1;
    /** Marker meaning "no byte is currently buffered in {@link #trailing}". */
    private static final int EMPTY_TRAILING = -3;

    // The byte that is logically trailing in Big5 encoding. In little-endian
    // order it is the first byte encountered, so it is buffered here until
    // the (logically leading) second byte has been returned.
    int trailing = EMPTY_TRAILING;

    /**
     * Creates a stream over the whole of {@code buf}.
     *
     * @param buf the little-endian, two-bytes-per-character input
     */
    public LittleEndianBig5Stream(byte[] buf) {
        super(buf);
    }

    /**
     * Creates a stream over a slice of {@code buf}.
     *
     * @param buf    the little-endian, two-bytes-per-character input
     * @param offset index of the first byte to read
     * @param length maximum number of bytes to read from {@code buf}
     */
    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
        super(buf, offset, length);
    }

    /**
     * Returns the next byte of the re-ordered stream.
     *
     * @return the next byte (0-255), or -1 once the input is exhausted;
     *         after -1 has been returned, every later call also returns -1
     */
    @Override
    public int read() {
        if (trailing != EMPTY_TRAILING) {
            // second half of a double-byte pair that was split by readNext()
            int pending = trailing;
            trailing = EMPTY_TRAILING;
            return pending;
        }
        return readNext();
    }

    /**
     * Consumes up to two input bytes and returns the byte to emit first.
     * <ul>
     * <li>If the second byte is greater than 0x80, it is returned as the
     *     lead byte and the first byte is buffered in {@link #trailing}.</li>
     * <li>Otherwise (zero padding or any other value) only the first byte is
     *     returned and the second byte is dropped.</li>
     * <li>If the input ends after the first byte of a pair, that dangling
     *     byte is returned on its own so that no input is lost and no data
     *     is ever produced after an end-of-stream has been reported.</li>
     * </ul>
     *
     * @return the byte to emit, or -1 at the end of the input
     */
    private int readNext() {
        int first = super.read();
        if (first == EOF) {
            // make sure no stale value is replayed after end-of-stream
            trailing = EMPTY_TRAILING;
            return EOF;
        }
        int second = super.read();
        if (second == EOF) {
            // odd number of bytes: emit the dangling byte now, EOF afterwards
            trailing = EMPTY_TRAILING;
            return first;
        }
        if (second > 0x80) {
            // double-byte character: lead byte first, then the buffered trail byte
            trailing = first;
            return second;
        }
        // zero-padded single byte (or unrecognised pair): keep the first byte only
        trailing = EMPTY_TRAILING;
        return first;
    }

    /**
     * Reads up to {@code len} re-ordered bytes into {@code buff}.
     *
     * @param buff destination buffer
     * @param off  index in {@code buff} of the first byte written
     * @param len  maximum number of bytes to read
     * @return the number of bytes stored, or -1 if no byte could be read
     *         because the end of the input had been reached
     * @throws NullPointerException      if {@code buff} is null
     * @throws IndexOutOfBoundsException if {@code off}/{@code len} do not
     *                                   describe a range inside {@code buff}
     */
    @Override
    public int read(byte[] buff, int off, int len) {
        if (buff == null) {
            throw new NullPointerException("buff");
        }
        if (off < 0 || len < 0 || len > buff.length - off) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", buff.length=" + buff.length);
        }
        int bytesRead = 0;
        while (bytesRead < len) {
            int b = read();
            if (b == EOF) {
                return (bytesRead == 0) ? EOF : bytesRead;
            }
            buff[off + bytesRead] = (byte) b;
            bytesRead++;
        }
        return bytesRead;
    }
}
|
|
@ -17,6 +17,8 @@
|
|||
|
||||
package org.apache.poi.util;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
|
@ -27,9 +29,14 @@ import java.util.Map;
|
|||
*/
|
||||
@Internal
|
||||
public class StringUtil {
|
||||
|
||||
private static final POILogger logger = POILogFactory
|
||||
.getLogger(StringUtil.class);
|
||||
protected static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
|
||||
protected static final Charset UTF16LE = Charset.forName("UTF-16LE");
|
||||
public static final Charset UTF16LE = Charset.forName("UTF-16LE");
|
||||
public static final Charset UTF8 = Charset.forName("UTF-8");
|
||||
public static final Charset WIN_1252 = Charset.forName("cp1252");
|
||||
public static final Charset BIG5 = Charset.forName("Big5");
|
||||
|
||||
private static Map<Integer,Integer> msCodepointToUnicode;
|
||||
|
||||
|
@ -573,7 +580,28 @@ public class StringUtil {
|
|||
9133, // 0xf0fe bracerightbt
|
||||
' ', // 0xf0ff not defined
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* This tries to convert a LE byte array in Big5 to a String.
|
||||
* We know MS zero-padded ascii, and we drop those.
|
||||
* However, there may be areas for improvement in this.
|
||||
*
|
||||
* @param data
|
||||
* @param offset
|
||||
* @param lengthInBytes
|
||||
* @return
|
||||
*/
|
||||
public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) {
|
||||
ByteArrayOutputStream os = new ByteArrayOutputStream();
|
||||
try {
|
||||
IOUtils.copy(new LittleEndianBig5Stream(data, offset, lengthInBytes), os);
|
||||
} catch (IOException e) {
|
||||
logger.log(POILogger.WARN,
|
||||
"IOException while copying a byte array stream to a byte array stream?!");
|
||||
}
|
||||
return new String(os.toByteArray(), BIG5);
|
||||
}
|
||||
|
||||
// Could be replaced with org.apache.commons.lang3.StringUtils#join
|
||||
@Internal
|
||||
public static String join(Object[] array, String separator) {
|
||||
|
|
|
@ -108,7 +108,7 @@ public class HwmfFont {
|
|||
return charset;
|
||||
}
|
||||
|
||||
static WmfCharset valueOf(int flag) {
|
||||
public static WmfCharset valueOf(int flag) {
|
||||
for (WmfCharset cs : values()) {
|
||||
if (cs.flag == flag) return cs;
|
||||
}
|
||||
|
|
|
@ -19,27 +19,43 @@ package org.apache.poi.hwpf;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.poi.hwmf.record.HwmfFont;
|
||||
import org.apache.poi.hwpf.model.ComplexFileTable;
|
||||
import org.apache.poi.hwpf.model.FontTable;
|
||||
import org.apache.poi.hwpf.model.OldCHPBinTable;
|
||||
import org.apache.poi.hwpf.model.OldComplexFileTable;
|
||||
import org.apache.poi.hwpf.model.OldFfn;
|
||||
import org.apache.poi.hwpf.model.OldFontTable;
|
||||
import org.apache.poi.hwpf.model.OldPAPBinTable;
|
||||
import org.apache.poi.hwpf.model.OldSectionTable;
|
||||
import org.apache.poi.hwpf.model.OldTextPieceTable;
|
||||
import org.apache.poi.hwpf.model.PieceDescriptor;
|
||||
import org.apache.poi.hwpf.model.TextPiece;
|
||||
import org.apache.poi.hwpf.model.TextPieceTable;
|
||||
import org.apache.poi.hwpf.usermodel.Range;
|
||||
import org.apache.poi.poifs.filesystem.DirectoryNode;
|
||||
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
|
||||
import org.apache.poi.util.CodePageUtil;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.NotImplemented;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
/**
|
||||
* Provides very simple support for old (Word 6 / Word 95)
|
||||
* files.
|
||||
*/
|
||||
public class HWPFOldDocument extends HWPFDocumentCore {
|
||||
private TextPieceTable tpt;
|
||||
|
||||
private final static Charset DEFAULT_CHARSET = StringUtil.WIN_1252;
|
||||
|
||||
private OldTextPieceTable tpt;
|
||||
|
||||
private StringBuilder _text;
|
||||
|
||||
private final OldFontTable fontTable;
|
||||
private final Charset guessedCharset;
|
||||
|
||||
public HWPFOldDocument(POIFSFileSystem fs) throws IOException {
|
||||
this(fs.getRoot());
|
||||
|
@ -56,45 +72,52 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||
int chpTableSize = LittleEndian.getInt(_mainStream, 0xbc);
|
||||
int papTableOffset = LittleEndian.getInt(_mainStream, 0xc0);
|
||||
int papTableSize = LittleEndian.getInt(_mainStream, 0xc4);
|
||||
//int shfTableOffset = LittleEndian.getInt(_mainStream, 0x60);
|
||||
//int shfTableSize = LittleEndian.getInt(_mainStream, 0x64);
|
||||
int fontTableOffset = LittleEndian.getInt(_mainStream, 0xd0);
|
||||
int fontTableSize = LittleEndian.getInt(_mainStream, 0xd4);
|
||||
|
||||
fontTable = new OldFontTable(_mainStream, fontTableOffset, fontTableSize);
|
||||
//TODO: figure out how to map runs/text pieces to fonts
|
||||
//for now, if there's a non standard codepage in one of the fonts
|
||||
//assume that the doc is in that codepage.
|
||||
guessedCharset = guessCodePage(fontTable);
|
||||
|
||||
int complexTableOffset = LittleEndian.getInt(_mainStream, 0x160);
|
||||
|
||||
// We need to get hold of the text that makes up the
|
||||
// document, which might be regular or fast-saved
|
||||
ComplexFileTable cft = null;
|
||||
StringBuffer text = new StringBuffer();
|
||||
if(_fib.getFibBase().isFComplex()) {
|
||||
cft = new ComplexFileTable(
|
||||
cft = new OldComplexFileTable(
|
||||
_mainStream, _mainStream,
|
||||
complexTableOffset, _fib.getFibBase().getFcMin()
|
||||
complexTableOffset, _fib.getFibBase().getFcMin(), guessedCharset
|
||||
);
|
||||
tpt = cft.getTextPieceTable();
|
||||
tpt = (OldTextPieceTable)cft.getTextPieceTable();
|
||||
|
||||
for(TextPiece tp : tpt.getTextPieces()) {
|
||||
text.append( tp.getStringBuilder() );
|
||||
}
|
||||
} else {
|
||||
// TODO Discover if these older documents can ever hold Unicode Strings?
|
||||
// (We think not, because they seem to lack a Piece table)
|
||||
// TODO Build the Piece Descriptor properly
|
||||
// (We have to fake it, as they don't seem to have a proper Piece table)
|
||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0);
|
||||
PieceDescriptor pd = new PieceDescriptor(new byte[] {0,0, 0,0,0,127, 0,0}, 0, guessedCharset);
|
||||
pd.setFilePosition(_fib.getFibBase().getFcMin());
|
||||
|
||||
// Generate a single Text Piece Table, with a single Text Piece
|
||||
// which covers all the (8 bit only) text in the file
|
||||
tpt = new TextPieceTable();
|
||||
tpt = new OldTextPieceTable();
|
||||
byte[] textData = new byte[_fib.getFibBase().getFcMac()-_fib.getFibBase().getFcMin()];
|
||||
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
|
||||
|
||||
int numChars = textData.length;
|
||||
if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
|
||||
numChars /= 2;
|
||||
}
|
||||
|
||||
TextPiece tp = new TextPiece(
|
||||
0, textData.length, textData, pd
|
||||
0, numChars, textData, pd
|
||||
);
|
||||
tpt.add(tp);
|
||||
|
||||
text.append(tp.getStringBuilder());
|
||||
}
|
||||
|
||||
_text = tpt.getText();
|
||||
|
||||
// Now we can fetch the character and paragraph properties
|
||||
|
@ -133,12 +156,54 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Take the first codepage that is not default, ansi or symbol.
|
||||
* Ideally, we'd want to track fonts with runs, but we don't yet
|
||||
* know how to do that.
|
||||
*
|
||||
* Consider throwing an exception if > 1 unique codepage that is not default, symbol or ansi
|
||||
* appears here.
|
||||
*
|
||||
* @param fontTable
|
||||
* @return
|
||||
*/
|
||||
private Charset guessCodePage(OldFontTable fontTable) {
|
||||
|
||||
for (OldFfn oldFfn : fontTable.getFontNames()) {
|
||||
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(oldFfn.getChs()& 0xff);
|
||||
if (wmfCharset != null &&
|
||||
wmfCharset != HwmfFont.WmfCharset.ANSI_CHARSET &&
|
||||
wmfCharset != HwmfFont.WmfCharset.DEFAULT_CHARSET &&
|
||||
wmfCharset != HwmfFont.WmfCharset.SYMBOL_CHARSET ) {
|
||||
return wmfCharset.getCharset();
|
||||
}
|
||||
}
|
||||
return DEFAULT_CHARSET;
|
||||
}
|
||||
|
||||
public Range getOverallRange()
|
||||
{
|
||||
// Life is easy when we have no footers, headers or unicode!
|
||||
return new Range( 0, _fib.getFibBase().getFcMac() - _fib.getFibBase().getFcMin(), this );
|
||||
}
|
||||
|
||||
/**
|
||||
* Use {@link #getOldFontTable()} instead!!!
|
||||
* This always throws an UnsupportedOperationException.
|
||||
*
|
||||
* @return nothing
|
||||
* @throws UnsupportedOperationException always; this document type exposes its fonts through {@link #getOldFontTable()}
|
||||
*/
|
||||
@Override
|
||||
@NotImplemented
|
||||
public FontTable getFontTable() {
|
||||
throw new UnsupportedOperationException("Use getOldFontTable instead.");
|
||||
}
|
||||
|
||||
public OldFontTable getOldFontTable() {
|
||||
return fontTable;
|
||||
}
|
||||
public Range getRange()
|
||||
{
|
||||
return getOverallRange();
|
||||
|
@ -167,4 +232,19 @@ public class HWPFOldDocument extends HWPFDocumentCore {
|
|||
public void write(OutputStream out) throws IOException {
|
||||
throw new IllegalStateException("Writing is not available for the older file formats");
|
||||
}
|
||||
|
||||
/**
|
||||
* As a rough heuristic (total hack), read through the font table
|
||||
* and take the first non-default, non-ansi, non-symbol
|
||||
* font's charset and return that.
|
||||
*
|
||||
* Once we figure out how to link a font to a text piece, we should
|
||||
* use the font information per text piece.
|
||||
*
|
||||
* @return charset
|
||||
*/
|
||||
public Charset getGuessedCharset() {
|
||||
return guessedCharset;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -26,9 +27,10 @@ import org.apache.poi.hwpf.model.io.HWPFOutputStream;
|
|||
import org.apache.poi.hwpf.sprm.SprmBuffer;
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
@Internal
|
||||
public final class ComplexFileTable {
|
||||
public class ComplexFileTable {
|
||||
private static final byte GRPPRL_TYPE = 1;
|
||||
private static final byte TEXT_PIECE_TABLE_TYPE = 2;
|
||||
|
||||
|
@ -40,7 +42,8 @@ public final class ComplexFileTable {
|
|||
_tpt = new TextPieceTable();
|
||||
}
|
||||
|
||||
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin) throws IOException {
|
||||
protected ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin,
|
||||
Charset charset) throws IOException {
|
||||
//skips through the prms before we reach the piece table. These contain data
|
||||
//for actual fast saved files
|
||||
List<SprmBuffer> sprmBuffers = new LinkedList<SprmBuffer>();
|
||||
|
@ -61,7 +64,12 @@ public final class ComplexFileTable {
|
|||
}
|
||||
int pieceTableSize = LittleEndian.getInt(tableStream, ++offset);
|
||||
offset += LittleEndian.INT_SIZE;
|
||||
_tpt = new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin);
|
||||
_tpt = newTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset);
|
||||
|
||||
}
|
||||
|
||||
/**
 * Reads a complex file table, passing {@link StringUtil#WIN_1252} as the
 * charset to the charset-aware constructor.
 *
 * @param documentStream the document stream bytes
 * @param tableStream    the table stream bytes
 * @param offset         offset in {@code tableStream} at which to start reading
 * @param fcMin          value forwarded to the text piece table
 * @throws IOException propagated from the charset-aware constructor
 */
public ComplexFileTable(byte[] documentStream, byte[] tableStream, int offset, int fcMin)
        throws IOException {
    this(documentStream, tableStream, offset, fcMin, StringUtil.WIN_1252);
}
|
||||
|
||||
public TextPieceTable getTextPieceTable() {
|
||||
|
@ -92,4 +100,11 @@ public final class ComplexFileTable {
|
|||
tableStream.write(table);
|
||||
}
|
||||
|
||||
/**
 * Factory hook used by the charset-aware constructor to create the text
 * piece table. This base implementation ignores {@code charset} and builds
 * a plain {@link TextPieceTable}; subclasses can override it to return a
 * charset-aware table.
 * NOTE(review): this overridable method is invoked from a constructor, so
 * overrides must not rely on subclass fields being initialised.
 *
 * @param documentStream the document stream bytes
 * @param tableStream    the table stream bytes
 * @param offset         offset of the piece table in {@code tableStream}
 * @param pieceTableSize size of the piece table, as read from the table stream
 * @param fcMin          value forwarded to the text piece table
 * @param charset        charset requested by the caller; unused here
 * @return a new text piece table
 */
protected TextPieceTable newTextPieceTable(byte[] documentStream,
|
||||
byte[] tableStream, int offset, int pieceTableSize, int fcMin,
|
||||
Charset charset) {
|
||||
return new TextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ public final class OldCHPBinTable extends CHPBinTable
|
|||
* @param fcMin
|
||||
*/
|
||||
public OldCHPBinTable(byte[] documentStream, int offset,
|
||||
int size, int fcMin, TextPieceTable tpt)
|
||||
int size, int fcMin, OldTextPieceTable tpt)
|
||||
{
|
||||
PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
|
||||
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.poi.util.Internal;
|
||||
|
||||
@Internal
|
||||
public final class OldComplexFileTable extends ComplexFileTable {
|
||||
|
||||
public OldComplexFileTable(byte[] documentStream, byte[] tableStream,
|
||||
int offset, int fcMin, Charset charset) throws IOException {
|
||||
super(documentStream, tableStream, offset, fcMin, charset);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected TextPieceTable newTextPieceTable(byte[] documentStream,
|
||||
byte[] tableStream, int offset,
|
||||
int pieceTableSize, int fcMin, Charset charset) {
|
||||
return new OldTextPieceTable(documentStream, tableStream, offset, pieceTableSize, fcMin, charset);
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,161 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.poi.hwmf.record.HwmfFont;
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
/**
|
||||
* Word 6.0 Font information
|
||||
*/
|
||||
@Internal
|
||||
public final class OldFfn {
|
||||
|
||||
private static final POILogger logger = POILogFactory.getLogger(OldFfn.class);
|
||||
|
||||
private byte _chs;// character set identifier
|
||||
|
||||
private final String fontName;
|
||||
private final String altFontName;
|
||||
|
||||
private final int length; //length in bytes for this record
|
||||
|
||||
/**
|
||||
* try to read an OldFfn starting at offset; read no farther than end
|
||||
*
|
||||
* @param buf buffer from which to read
|
||||
* @param offset offset at which to start
|
||||
* @param fontTableEnd read no farther than this
|
||||
* @return an OldFfn or null if asked to read beyond end
|
||||
*/
|
||||
static OldFfn build(byte[] buf, int offset, int fontTableEnd) {
|
||||
int start = offset;
|
||||
//preliminary bytes
|
||||
if (offset + 6 > fontTableEnd) {
|
||||
return null;
|
||||
}
|
||||
//first byte
|
||||
short fontDescriptionLength = (short) buf[offset];
|
||||
offset += 1;
|
||||
if (offset + fontDescriptionLength > fontTableEnd) {
|
||||
logger.log(POILogger.WARN, "Asked to read beyond font table end. Skipping font");
|
||||
return null;
|
||||
}
|
||||
|
||||
//no idea what these 3 bytes do
|
||||
offset += 3;
|
||||
byte chs = buf[offset];
|
||||
Charset charset = null;
|
||||
HwmfFont.WmfCharset wmfCharset = HwmfFont.WmfCharset.valueOf(chs & 0xff);
|
||||
if (wmfCharset == null) {
|
||||
logger.log(POILogger.WARN, "Couldn't find font for type: " + (chs & 0xff));
|
||||
} else {
|
||||
charset = wmfCharset.getCharset();
|
||||
}
|
||||
charset = charset == null ? StringUtil.WIN_1252 : charset;
|
||||
offset += LittleEndian.BYTE_SIZE;
|
||||
//if this byte here == 7, it _may_ signify existence of
|
||||
//an altername font name
|
||||
|
||||
//not sure what the byte after the _chs does
|
||||
offset += LittleEndian.BYTE_SIZE;
|
||||
int fontNameLength = -1;
|
||||
for (int i = offset; i < fontTableEnd; i++) {
|
||||
if (buf[i] == 0) {
|
||||
fontNameLength = i - offset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (fontNameLength == -1) {
|
||||
logger.log(POILogger.WARN, "Couldn't find the zero-byte delimited font name length");
|
||||
return null;
|
||||
}
|
||||
String fontName = new String(buf, offset, fontNameLength, charset);
|
||||
String altFontName = null;
|
||||
int altFontNameLength = -1;
|
||||
offset += fontNameLength + 1;
|
||||
if (offset - start < fontDescriptionLength) {
|
||||
for (int i = offset; i <= start + fontDescriptionLength; i++) {
|
||||
if (buf[i] == 0) {
|
||||
altFontNameLength = i - offset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (altFontNameLength > -1) {
|
||||
altFontName = new String(buf, offset, altFontNameLength, charset);
|
||||
}
|
||||
}
|
||||
//reset to 0 for length calculation
|
||||
altFontNameLength = (altFontNameLength < 0) ? 0 : altFontNameLength + 1;//add one for zero byte
|
||||
|
||||
int len = LittleEndian.INT_SIZE + LittleEndian.BYTE_SIZE + LittleEndian.BYTE_SIZE +//6 starting bytes
|
||||
fontNameLength + altFontNameLength + 1;//+1 is for the zero byte
|
||||
//this len should == fontDescriptionLength
|
||||
|
||||
return new OldFfn(chs, fontName, altFontName, len);
|
||||
|
||||
}
|
||||
|
||||
public OldFfn(byte charsetIdentifier, String fontName, String altFontName, int length) {
|
||||
this._chs = charsetIdentifier;
|
||||
this.fontName = fontName;
|
||||
this.altFontName = altFontName;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
public byte getChs() {
|
||||
return _chs;
|
||||
}
|
||||
|
||||
public String getMainFontName() {
|
||||
return fontName;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return altFontName if it exists, null otherwise
|
||||
*/
|
||||
public String getAltFontName() {
|
||||
return altFontName;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return length in bytes for this record
|
||||
*/
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "OldFfn{" +
|
||||
"_chs=" + (_chs & 0xff) +
|
||||
", fontName='" + fontName + '\'' +
|
||||
", altFontName='" + altFontName + '\'' +
|
||||
", length=" + length +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
/**
|
||||
* Font table for Word 6.0
|
||||
*/
|
||||
@Internal
|
||||
public final class OldFontTable {
|
||||
private final static POILogger _logger = POILogFactory.getLogger(OldFontTable.class);
|
||||
|
||||
// added extra facilitator members
|
||||
// FFN structure containing strings of font names
|
||||
private final OldFfn[] _fontNames;
|
||||
|
||||
public OldFontTable(byte[] buf, int offset, int length) {
|
||||
//length is stored at the index section in the table
|
||||
//and it is recorded in the first short.
|
||||
|
||||
|
||||
List<OldFfn> ffns = new ArrayList<OldFfn>();
|
||||
int fontTableLength = LittleEndian.getShort(buf, offset);
|
||||
|
||||
int endOfTableOffset = offset + length;
|
||||
int startOffset = offset + LittleEndian.SHORT_SIZE;//first short should == length!
|
||||
|
||||
while (true) {
|
||||
OldFfn oldFfn = OldFfn.build(buf, startOffset, endOfTableOffset);
|
||||
if (oldFfn == null) {
|
||||
break;
|
||||
}
|
||||
ffns.add(oldFfn);
|
||||
startOffset += oldFfn.getLength();
|
||||
|
||||
}
|
||||
_fontNames = ffns.toArray(new OldFfn[ffns.size()]);
|
||||
}
|
||||
|
||||
|
||||
public OldFfn[] getFontNames() {
|
||||
return _fontNames;
|
||||
}
|
||||
|
||||
|
||||
public String getMainFont(int chpFtc) {
|
||||
if (chpFtc >= _fontNames.length) {
|
||||
_logger.log(POILogger.INFO, "Mismatch in chpFtc with stringCount");
|
||||
return null;
|
||||
}
|
||||
|
||||
return _fontNames[chpFtc].getMainFontName();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "OldFontTable{" +
|
||||
"_fontNames=" + Arrays.toString(_fontNames) +
|
||||
'}';
|
||||
}
|
||||
}
|
|
@ -0,0 +1,120 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.NotImplemented;
|
||||
|
||||
/**
|
||||
* Lightweight representation of a text piece.
|
||||
* Works in the character domain, not the byte domain, so you
|
||||
* need to have turned byte references into character
|
||||
* references before getting here.
|
||||
*/
|
||||
@Internal
|
||||
public class OldTextPiece extends TextPiece {
|
||||
|
||||
private final byte[] rawBytes;
|
||||
|
||||
/**
|
||||
* @param start Beginning offset in main document stream, in characters.
|
||||
* @param end Ending offset in main document stream, in characters.
|
||||
* @param text The raw bytes of our text
|
||||
*/
|
||||
public OldTextPiece(int start, int end, byte[] text, PieceDescriptor pd) {
|
||||
super(start, end, text, pd);
|
||||
this.rawBytes = text;
|
||||
if (end < start) {
|
||||
throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return nothing, ever. Always throws an UnsupportedOperationException
|
||||
* @throws UnsupportedOperationException
|
||||
*/
|
||||
@NotImplemented
|
||||
@Override
|
||||
public boolean isUnicode() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
public StringBuilder getStringBuilder() {
|
||||
return (StringBuilder) _buf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getRawBytes() {
|
||||
byte[] buf = new byte[rawBytes.length];
|
||||
System.arraycopy(rawBytes, 0, buf, 0, rawBytes.length);
|
||||
return buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns part of the string.
|
||||
* Works only in characters, not in bytes!
|
||||
*
|
||||
* @param start Local start position, in characters
|
||||
* @param end Local end position, in characters
|
||||
* @throws UnsupportedOperationException
|
||||
*/
|
||||
@Deprecated
|
||||
@NotImplemented
|
||||
public String substring(int start, int end) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Not implemented for OldTextPiece.
|
||||
* Always throws UnsupportedOperationException
|
||||
*/
|
||||
@Deprecated
|
||||
@NotImplemented
|
||||
public void adjustForDelete(int start, int length) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length, in bytes
|
||||
*/
|
||||
public int bytesLength() {
|
||||
return rawBytes.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
assert false : "hashCode not designed";
|
||||
return 42; // any arbitrary constant will do
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the character position we start at.
|
||||
*/
|
||||
public int getCP() {
|
||||
return getStart();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "OldTextPiece from " + getStart() + " to " + getEnd() + " ("
|
||||
+ getPieceDescriptor() + ")";
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
/* ====================================================================
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==================================================================== */
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.poi.util.CodePageUtil;
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.POILogFactory;
|
||||
import org.apache.poi.util.POILogger;
|
||||
|
||||
|
||||
@Internal
|
||||
public class OldTextPieceTable extends TextPieceTable {
|
||||
|
||||
private static final POILogger logger = POILogFactory
|
||||
.getLogger(OldTextPieceTable.class);
|
||||
|
||||
public OldTextPieceTable() {
|
||||
super();
|
||||
}
|
||||
|
||||
public OldTextPieceTable(byte[] documentStream, byte[] tableStream,
|
||||
int offset, int size, int fcMin, Charset charset) {
|
||||
//super(documentStream, tableStream, offset, size, fcMin, charset);
|
||||
// get our plex of PieceDescriptors
|
||||
PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size,
|
||||
PieceDescriptor.getSizeInBytes());
|
||||
|
||||
int length = pieceTable.length();
|
||||
PieceDescriptor[] pieces = new PieceDescriptor[length];
|
||||
|
||||
// iterate through piece descriptors raw bytes and create
|
||||
// PieceDescriptor objects
|
||||
for (int x = 0; x < length; x++) {
|
||||
GenericPropertyNode node = pieceTable.getProperty(x);
|
||||
pieces[x] = new PieceDescriptor(node.getBytes(), 0, charset);
|
||||
}
|
||||
|
||||
// Figure out the cp of the earliest text piece
|
||||
// Note that text pieces don't have to be stored in order!
|
||||
_cpMin = pieces[0].getFilePosition() - fcMin;
|
||||
for (PieceDescriptor piece : pieces) {
|
||||
int start = piece.getFilePosition() - fcMin;
|
||||
if (start < _cpMin) {
|
||||
_cpMin = start;
|
||||
}
|
||||
}
|
||||
|
||||
// using the PieceDescriptors, build our list of TextPieces.
|
||||
for (int x = 0; x < pieces.length; x++) {
|
||||
int start = pieces[x].getFilePosition();
|
||||
GenericPropertyNode node = pieceTable.getProperty(x);
|
||||
|
||||
// Grab the start and end, which are in characters
|
||||
int nodeStartChars = node.getStart();
|
||||
int nodeEndChars = node.getEnd();
|
||||
|
||||
// What's the relationship between bytes and characters?
|
||||
boolean unicode = pieces[x].isUnicode();
|
||||
int multiple = 1;
|
||||
if (unicode ||
|
||||
(charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) {
|
||||
multiple = 2;
|
||||
}
|
||||
|
||||
// Figure out the length, in bytes and chars
|
||||
int textSizeChars = (nodeEndChars - nodeStartChars);
|
||||
int textSizeBytes = textSizeChars * multiple;
|
||||
|
||||
// Grab the data that makes up the piece
|
||||
byte[] buf = new byte[textSizeBytes];
|
||||
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
|
||||
|
||||
// And now build the piece
|
||||
final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf,
|
||||
pieces[x]);
|
||||
|
||||
_textPieces.add(newTextPiece);
|
||||
}
|
||||
|
||||
// In the interest of our sanity, now sort the text pieces
|
||||
// into order, if they're not already
|
||||
Collections.sort(_textPieces);
|
||||
_textPiecesFCOrder = new ArrayList<TextPiece>(_textPieces);
|
||||
Collections.sort(_textPiecesFCOrder, new FCComparator());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
|
||||
return new OldTextPiece(nodeStartChars, nodeEndChars, buf, pd);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int getEncodingMultiplier(TextPiece textPiece) {
|
||||
Charset charset = textPiece.getPieceDescriptor().getCharset();
|
||||
if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) {
|
||||
return 2;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
}
|
|
@ -260,7 +260,7 @@ public class PAPBinTable
|
|||
SprmBuffer sprmBuffer = null;
|
||||
for ( PAPX papx : papxs )
|
||||
{
|
||||
if ( papx.getGrpprl() == null || papx.getGrpprl().length == 0 )
|
||||
if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 )
|
||||
continue;
|
||||
|
||||
if ( sprmBuffer == null ) {
|
||||
|
|
|
@ -17,10 +17,13 @@
|
|||
|
||||
package org.apache.poi.hwpf.model;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.poi.util.BitField;
|
||||
import org.apache.poi.util.BitFieldFactory;
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.LittleEndian;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
@Internal
|
||||
public final class PieceDescriptor
|
||||
|
@ -32,29 +35,51 @@ public final class PieceDescriptor
|
|||
private static BitField fCopied = BitFieldFactory.getInstance(0x04);
|
||||
int fc;
|
||||
PropertyModifier prm;
|
||||
boolean unicode;
|
||||
boolean unicode = false;
|
||||
private final Charset charset;
|
||||
|
||||
|
||||
public PieceDescriptor(byte[] buf, int offset)
|
||||
{
|
||||
descriptor = LittleEndian.getShort(buf, offset);
|
||||
offset += LittleEndian.SHORT_SIZE;
|
||||
fc = LittleEndian.getInt(buf, offset);
|
||||
offset += LittleEndian.INT_SIZE;
|
||||
prm = new PropertyModifier( LittleEndian.getShort(buf, offset));
|
||||
|
||||
// see if this piece uses unicode.
|
||||
if ((fc & 0x40000000) == 0)
|
||||
{
|
||||
unicode = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
unicode = false;
|
||||
fc &= ~(0x40000000);//gives me FC in doc stream
|
||||
fc /= 2;
|
||||
/**
 * Parses a piece descriptor without an explicit charset; delegates to the
 * three-argument constructor with a <code>null</code> charset.
 *
 * @param buf    bytes containing the descriptor
 * @param offset position of the descriptor within {@code buf}
 */
public PieceDescriptor(byte[] buf, int offset) {
    this(buf, offset, (Charset) null);
}
|
||||
|
||||
/**
 * Parses a piece descriptor from its raw bytes.
 * <p>
 * Passing a non-null charset should only be done for HWPFOldDocuments.
 * In that case the file position is taken as read and the piece counts
 * as unicode only if the charset equals UTF-16LE. With a
 * <code>null</code> charset the encoding is derived from the file
 * position itself (see the comment in the body).
 *
 * @param buf     bytes containing the descriptor
 * @param offset  position of the descriptor within {@code buf}
 * @param charset which charset to use if this is not unicode; may be
 *                <code>null</code>
 */
public PieceDescriptor(byte[] buf, int offset, Charset charset) {
    descriptor = LittleEndian.getShort(buf, offset);
    offset += LittleEndian.SHORT_SIZE;
    fc = LittleEndian.getInt(buf, offset);
    offset += LittleEndian.INT_SIZE;
    prm = new PropertyModifier(LittleEndian.getShort(buf, offset));
    if (charset == null) {
        // see if this piece uses unicode.
        // From the documentation: If the second most significant bit
        // is clear, then this indicates the actual file offset of the
        // Unicode character (two bytes). If the second most significant
        // bit is set, then the actual address of the codepage-1252
        // compressed version of the Unicode character (one byte), is
        // actually at the offset indicated by clearing this bit and
        // dividing by two.
        if ((fc & 0x40000000) == 0) {
            unicode = true;
            this.charset = null;
        } else {
            unicode = false;
            fc &= ~(0x40000000);//gives me FC in doc stream
            fc /= 2;
            this.charset = StringUtil.WIN_1252;
        }
    } else {
        // Compare by value rather than by reference, so that an equal
        // Charset obtained from elsewhere is recognised as well.
        unicode = charset.equals(StringUtil.UTF16LE);
        this.charset = charset;
    }
}
|
||||
|
||||
public int getFilePosition()
|
||||
|
@ -72,6 +97,15 @@ public final class PieceDescriptor
|
|||
return unicode;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return charset to use if this is not a Unicode PieceDescriptor
|
||||
* this can be <code>null</code>
|
||||
*/
|
||||
public Charset getCharset() {
|
||||
return charset;
|
||||
}
|
||||
|
||||
public PropertyModifier getPrm()
|
||||
{
|
||||
return prm;
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.apache.poi.hwpf.model;
|
|||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.poi.util.Internal;
|
||||
import org.apache.poi.util.StringUtil;
|
||||
|
||||
/**
|
||||
* Lightweight representation of a text piece.
|
||||
|
@ -40,7 +41,6 @@ public class TextPiece extends PropertyNode<TextPiece> {
|
|||
* @param start Beginning offset in main document stream, in characters.
|
||||
* @param end Ending offset in main document stream, in characters.
|
||||
* @param text The raw bytes of our text
|
||||
* @deprecated Use {@link #TextPiece(int, int, byte[], PieceDescriptor)}
|
||||
* instead
|
||||
*/
|
||||
public TextPiece(int start, int end, byte[] text, PieceDescriptor pd,
|
||||
|
@ -72,8 +72,13 @@ public class TextPiece extends PropertyNode<TextPiece> {
|
|||
* Create the StringBuilder from the text and unicode flag
|
||||
*/
|
||||
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
|
||||
String str = new String(text, Charset.forName(pd.isUnicode() ? "UTF-16LE" : "Cp1252"));
|
||||
byte[] textBuffer = text;
|
||||
if (StringUtil.BIG5.equals(pd.getCharset())) {
|
||||
String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString();
|
||||
return new StringBuilder(txt);
|
||||
}
|
||||
|
||||
String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset());
|
||||
return new StringBuilder(str);
|
||||
}
|
||||
|
||||
|
@ -207,4 +212,5 @@ public class TextPiece extends PropertyNode<TextPiece> {
|
|||
return "TextPiece from " + getStart() + " to " + getEnd() + " ("
|
||||
+ getPieceDescriptor() + ")";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -101,7 +101,7 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||
System.arraycopy(documentStream, start, buf, 0, textSizeBytes);
|
||||
|
||||
// And now build the piece
|
||||
final TextPiece newTextPiece = new TextPiece(nodeStartChars, nodeEndChars, buf,
|
||||
final TextPiece newTextPiece = newTextPiece(nodeStartChars, nodeEndChars, buf,
|
||||
pieces[x]);
|
||||
|
||||
_textPieces.add(newTextPiece);
|
||||
|
@ -114,6 +114,10 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||
Collections.sort(_textPiecesFCOrder, new FCComparator());
|
||||
}
|
||||
|
||||
protected TextPiece newTextPiece(int nodeStartChars, int nodeEndChars, byte[] buf, PieceDescriptor pd) {
|
||||
return new TextPiece(nodeStartChars, nodeEndChars, buf, pd);
|
||||
}
|
||||
|
||||
public void add(TextPiece piece) {
|
||||
_textPieces.add(piece);
|
||||
_textPiecesFCOrder.add(piece);
|
||||
|
@ -249,7 +253,7 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||
if (rangeStartBytes > rangeEndBytes)
|
||||
continue;
|
||||
|
||||
final int encodingMultiplier = textPiece.isUnicode() ? 2 : 1;
|
||||
final int encodingMultiplier = getEncodingMultiplier(textPiece);
|
||||
|
||||
final int rangeStartCp = textPiece.getStart()
|
||||
+ (rangeStartBytes - tpStart) / encodingMultiplier;
|
||||
|
@ -262,6 +266,10 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||
return result.toArray(new int[result.size()][]);
|
||||
}
|
||||
|
||||
protected int getEncodingMultiplier(TextPiece textPiece) {
|
||||
return textPiece.isUnicode() ? 2 : 1;
|
||||
}
|
||||
|
||||
public int getCpMin() {
|
||||
return _cpMin;
|
||||
}
|
||||
|
@ -439,7 +447,7 @@ public class TextPieceTable implements CharIndexTranslator {
|
|||
return textPlex.toByteArray();
|
||||
}
|
||||
|
||||
private static class FCComparator implements Comparator<TextPiece>, Serializable {
|
||||
protected static class FCComparator implements Comparator<TextPiece>, Serializable {
|
||||
public int compare(TextPiece textPiece, TextPiece textPiece1) {
|
||||
if (textPiece.getPieceDescriptor().fc > textPiece1
|
||||
.getPieceDescriptor().fc) {
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||
import org.apache.poi.hwpf.model.CHPX;
|
||||
import org.apache.poi.hwpf.model.FFData;
|
||||
import org.apache.poi.hwpf.model.Ffn;
|
||||
|
@ -438,6 +439,10 @@ public final class CharacterRun extends Range
|
|||
|
||||
public String getFontName()
|
||||
{
|
||||
if (_doc instanceof HWPFOldDocument) {
|
||||
return ((HWPFOldDocument) _doc).getOldFontTable().getMainFont(_props.getFtcAscii());
|
||||
}
|
||||
|
||||
if (_doc.getFontTable() == null)
|
||||
// old word format
|
||||
return null;
|
||||
|
|
|
@ -16,18 +16,19 @@
|
|||
==================================================================== */
|
||||
package org.apache.poi.hwpf.converter;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FilenameFilter;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.File;
|
||||
import java.io.FilenameFilter;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.poi.POIDataSamples;
|
||||
import org.apache.poi.hwpf.HWPFDocumentCore;
|
||||
|
@ -36,8 +37,6 @@ import org.junit.Test;
|
|||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.Parameterized;
|
||||
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
|
||||
@RunWith(Parameterized.class)
|
||||
public class TestWordToConverterSuite
|
||||
{
|
||||
|
@ -45,7 +44,11 @@ public class TestWordToConverterSuite
|
|||
* YK: a quick hack to exclude failing documents from the suite.
|
||||
*/
|
||||
private static List<String> failingFiles = Arrays
|
||||
.asList( "ProblemExtracting.doc" );
|
||||
.asList( "ProblemExtracting.doc",
|
||||
"Bug50955.doc" //basic extraction works,
|
||||
// but these extractors modify the document,
|
||||
// which is a no-go for this Word 6.0 file
|
||||
);
|
||||
|
||||
@Parameterized.Parameters(name="{index}: {0}")
|
||||
public static Iterable<Object[]> files() {
|
||||
|
|
|
@ -57,6 +57,7 @@ import junit.framework.TestCase;
|
|||
* against HWPF
|
||||
*/
|
||||
public class TestBugs{
|
||||
|
||||
private static final POILogger logger = POILogFactory.getLogger(TestBugs.class);
|
||||
|
||||
public static void assertEqualsIgnoreNewline(String expected, String actual )
|
||||
|
@ -536,13 +537,6 @@ public class TestBugs{
|
|||
hwpfDocument.getPicturesTable().getAllPictures();
|
||||
}
|
||||
|
||||
/**
|
||||
* [FAILING] Bug 50955 - error while retrieving the text file
|
||||
*/
|
||||
@Test(expected=IllegalStateException.class)
|
||||
public void test50955() throws IOException {
|
||||
getTextOldFile("Bug50955.doc");
|
||||
}
|
||||
|
||||
/**
|
||||
* [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta
|
||||
|
|
|
@ -17,14 +17,19 @@
|
|||
|
||||
package org.apache.poi.hwpf.usermodel;
|
||||
|
||||
import static org.apache.poi.POITestCase.assertContains;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.poi.OldFileFormatException;
|
||||
import org.apache.poi.hwmf.record.HwmfFont;
|
||||
import org.apache.poi.hwpf.HWPFOldDocument;
|
||||
import org.apache.poi.hwpf.HWPFTestCase;
|
||||
import org.apache.poi.hwpf.HWPFTestDataSamples;
|
||||
import org.apache.poi.hwpf.extractor.Word6Extractor;
|
||||
import org.apache.poi.hwpf.model.OldFontTable;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
|
@ -98,7 +103,7 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
|
|||
assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
|
||||
// Normal, superscript for 4th, normal
|
||||
assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
|
||||
|
||||
|
||||
doc.close();
|
||||
}
|
||||
|
||||
|
@ -143,4 +148,87 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
|
|||
doc.getRange().getParagraph(1).text());
|
||||
doc.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDefaultCodePageEncoding() throws IOException {
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
|
||||
Word6Extractor ex = new Word6Extractor(doc);
|
||||
String txt = ex.getText();
|
||||
assertContains(txt, "BERTHOD");
|
||||
assertContains(txt, "APPLICOLOR");
|
||||
assertContains(txt, "les meilleurs");
|
||||
assertContains(txt, "GUY LECOLE");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testCodePageBug50955() throws IOException {
|
||||
//windows 1251
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
|
||||
Word6Extractor ex = new Word6Extractor(doc);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String p : ex.getParagraphText()) {
|
||||
sb.append(p);
|
||||
}
|
||||
assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings!
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCodePageBug60936() throws IOException {
|
||||
//windows 1250 -- this test file was generated with OpenOffice
|
||||
//see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration
|
||||
|
||||
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc");
|
||||
Word6Extractor ex = new Word6Extractor(doc);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String p : ex.getParagraphText()) {
|
||||
sb.append(p);
|
||||
}
|
||||
assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings!
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOldFontTableEncoding() throws IOException {
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
|
||||
OldFontTable oldFontTable = doc.getOldFontTable();
|
||||
assertEquals(5, oldFontTable.getFontNames().length);
|
||||
assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName());
|
||||
assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5"));
|
||||
assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName());
|
||||
doc.close();
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOldFontTableAltName() throws IOException {
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc");
|
||||
OldFontTable oldFontTable = doc.getOldFontTable();
|
||||
assertEquals(5, oldFontTable.getFontNames().length);
|
||||
assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName());
|
||||
assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName());
|
||||
assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName());
|
||||
assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void test51944() throws IOException {
|
||||
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
|
||||
Word6Extractor ex = new Word6Extractor(doc);
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String p : ex.getParagraphText()) {
|
||||
sb.append(p.replaceAll("[\r\n]+", "\n"));
|
||||
}
|
||||
String txt = sb.toString();
|
||||
assertContains(txt, "Post and Fax");
|
||||
assertContains(txt, "also maintain");//this is at a critical juncture
|
||||
assertContains(txt, "which are available for");//this too
|
||||
|
||||
//TODO: figure out why these two aren't passing
|
||||
// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
|
||||
// assertContains(txt, "We are able to");//not sure if we can get this easily?
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue