bug 50955 and bug 60953 improve Big5 reader; ensure one character

per byte pair

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1790172 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tim Allison 2017-04-05 01:45:55 +00:00
parent 0f9335a575
commit 55ed734108
11 changed files with 593 additions and 158 deletions

View File

@ -31,10 +31,12 @@ import java.util.Set;
public class CodePageUtil
{
public static final Set<Charset> VARIABLE_BYTE_CHARSETS = new HashSet<Charset>();
public static final Set<Charset> DOUBLE_BYTE_CHARSETS
= new HashSet<Charset>();
static {
DOUBLE_BYTE_CHARSETS.add(StringUtil.BIG5);
//others?
VARIABLE_BYTE_CHARSETS.add(StringUtil.BIG5);
}
/** <p>Codepage 037, a special case</p> */
@ -450,4 +452,26 @@ public class CodePageUtil
return "cp" + codepage;
}
}
/**
 * Converts a little-endian byte array in cp950 (Microsoft's dialect of
 * Big5) to a String.
 * <p>
 * The bytes are handed to a {@link LittleEndianCP950Reader} and characters
 * are pulled from it one at a time until it reports end of input with -1.
 * We know MS zero-padded ascii, and the padding is dropped.
 * There may be areas for improvement in this.
 *
 * @param data buffer holding the encoded text
 * @param offset index into <code>data</code> that is passed to the reader as its start position
 * @param lengthInBytes length, in bytes, that is passed to the reader
 * @return the characters returned by the reader, in the order they were read
 */
public static String cp950ToString(byte[] data, int offset, int lengthInBytes) {
    LittleEndianCP950Reader cp950 = new LittleEndianCP950Reader(data, offset, lengthInBytes);
    StringBuilder decoded = new StringBuilder();
    for (int ch = cp950.read(); ch != -1; ch = cp950.read()) {
        decoded.append((char) ch);
    }
    return decoded.toString();
}
}

View File

@ -1,107 +0,0 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.util;
import java.io.ByteArrayInputStream;
/**
* Stream that converts MSOffice's way of storing Big5, with
* zero-byte padding for ASCII and in LittleEndianOrder.
*/
@Internal
public class LittleEndianBig5Stream extends ByteArrayInputStream {
    private static final int EOF = -1;
    private static final int EMPTY_TRAILING = -3;

    //the byte that is logically trailing in Big5 encoding
    //however in LittleEndian order, this is the first encountered.
    //EMPTY_TRAILING means that no byte is waiting to be handed out.
    int trailing = EMPTY_TRAILING;

    /**
     * Creates a stream over the whole of <code>buf</code>.
     *
     * @param buf the little-endian, zero-padded bytes to read
     */
    public LittleEndianBig5Stream(byte[] buf) {
        super(buf);
    }

    /**
     * Creates a stream over a slice of <code>buf</code>.
     *
     * @param buf the little-endian, zero-padded bytes to read
     * @param offset index of the first byte to read
     * @param length maximum number of bytes to read
     */
    public LittleEndianBig5Stream(byte[] buf, int offset, int length) {
        super(buf, offset, length);
    }

    /**
     * Returns the next byte of the re-ordered stream.
     * <p>
     * The underlying bytes are consumed two at a time. When the second byte
     * of a pair is greater than 0x80 the pair is emitted second byte first,
     * one byte per call. Otherwise only the first byte of the pair is
     * emitted and the second one is dropped.
     *
     * @return the next byte as a value from 0 to 255, or -1 at the end of
     *         the stream; an incomplete pair at the very end is discarded
     */
    @Override
    public int read() {
        if (trailing != EMPTY_TRAILING) {
            int pending = trailing;
            trailing = EMPTY_TRAILING;
            return pending;
        }
        return readNext();
    }

    //consumes one pair from the underlying array
    //returns the byte to emit now and parks the other one in trailing
    //when it has to be emitted by the following call
    //returns -1 if it hits the end of the stream
    private int readNext() {
        int first = super.read();
        if (first == EOF) {
            return EOF;
        }
        int second = super.read();
        if (second == EOF) {
            //dangling single byte: nothing may be left parked in trailing,
            //otherwise the next read() would return data after EOF
            return EOF;
        }
        if (second > 0x80) {
            trailing = first;
            return second;
        }
        //zero padding (or any other second byte up to 0x80): emit the
        //first byte alone
        return first;
    }

    /**
     * Fills <code>buff</code> by calling {@link #read()} repeatedly.
     *
     * @param buff destination array
     * @param off index of the first element to write
     * @param len maximum number of bytes to store
     * @return the number of bytes stored, or -1 if the end of the stream
     *         was reached before any byte could be stored
     */
    @Override
    public int read(byte[] buff, int off, int len) {
        int bytesRead = 0;
        for (int i = off; i < off + len; i++) {
            int b = read();
            if (b == EOF) {
                return (bytesRead == 0) ? EOF : bytesRead;
            }
            buff[i] = (byte) b;
            bytesRead++;
        }
        return bytesRead;
    }
}

View File

@ -0,0 +1,480 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.util;
import java.io.IOException;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
/**
* Stream that converts CP950 (MSOffice's dialect of Big5), with
* zero-byte padding for ASCII and in LittleEndianOrder.
*/
@Internal
public class LittleEndianCP950Reader extends Reader {
//used to warn about byte pairs that cannot be turned into a character
private static final POILogger LOGGER = POILogFactory.getLogger(LittleEndianCP950Reader.class);
//returned in place of a byte pair that cannot be turned into a character
private static final char UNMAPPABLE = (char) '?';
//scratch buffer: receives the current pair as leading byte, then trailing byte
private final ByteBuffer doubleByteBuffer = ByteBuffer.allocate(2);
//scratch buffer: receives the output of the decoder for the current pair
private final CharBuffer charBuffer = CharBuffer.allocate(2);
//decodes the pairs that are not handled by handleF9 or handleRange1 to handleRange4
private final CharsetDecoder decoder = StringUtil.BIG5.newDecoder();
//https://en.wikipedia.org/wiki/Code_page_950
//see private use area
//read() compares (leading << 8) + trailing against these inclusive bounds
//and dispatches to handleRange1 to handleRange4 respectively
private final static char range1Low = '\u8140';
private final static char range1High = '\u8DFE';
private final static char range2Low = '\u8E40';
private final static char range2High = '\uA0FE';
private final static char range3Low = '\uC6A1';
private final static char range3High = '\uC8FE';
private final static char range4Low = '\uFA40';
private final static char range4High = '\uFEFE';
//the array that is read from, as passed to the constructor
private final byte[] data;
//index of the first byte to read, as passed to the constructor
private final int startOffset;
//length value passed to the constructor; read() checks it against (offset - startOffset)
private final int length;
//index in data of the next byte to read; advanced by two on every successful read()
private int offset;
//the char that is logically trailing in Big5 encoding
//however in LittleEndian order, this is the first encountered.
private int trailing;
//the second byte of the most recently read pair
private int leading;
//NOTE(review): not referenced anywhere in this class - confirm it can be removed
int cnt = 0;
/**
 * Creates a reader over the whole of <code>data</code>, i.e. starting at
 * index 0 with a length of <code>data.length</code>.
 *
 * @param data the bytes to decode
 */
public LittleEndianCP950Reader(byte[] data) {
    this(data, 0, data.length);
}

/**
 * Creates a reader that starts reading at <code>offset</code>.
 * The array is stored as is; it is not copied.
 *
 * @param data the bytes to decode
 * @param offset index in <code>data</code> of the first byte to read
 * @param length length value that {@link #read()} checks the consumed byte count against
 */
public LittleEndianCP950Reader(byte[] data, int offset, int length) {
    this.data = data;
    this.length = length;
    this.startOffset = offset;
    this.offset = offset;
}
/**
 * Decodes the next byte pair and returns it as one character.
 * <p>
 * Each call consumes exactly two bytes: the first one is the trailing
 * byte, the second one the leading byte. A leading byte below 0x81 means
 * the trailing byte is returned on its own. Leading byte 0xf9 and the four
 * ranges described by the <code>range*</code> constants are mapped by the
 * <code>handle*</code> helpers; every other pair goes through the Big5
 * decoder.
 *
 * @return the decoded character, '?' if the pair could not be mapped, or
 *         -1 once fewer than two bytes are left in the array or in the
 *         window given to the constructor
 */
@Override
public int read() {
    //a character always needs two bytes: stop unless both of them lie
    //inside the array and inside the [startOffset, startOffset + length)
    //window, so that an odd-sized input ends cleanly instead of running
    //past the end of the array or of the window
    if (offset + 2 > data.length || offset - startOffset + 2 > length) {
        return -1;
    }
    trailing = data[offset++] & 0xff;
    leading = data[offset++] & 0xff;
    if (leading < 0x81) {
        //return trailing alone
        //there may be some subtleties here
        return trailing;
    }
    if (leading == 0xf9) {
        return handleF9(trailing);
    }
    int ch = (leading << 8) + trailing;
    if (ch >= range1Low && ch <= range1High) {
        return handleRange1(leading, trailing);
    } else if (ch >= range2Low && ch <= range2High) {
        return handleRange2(leading, trailing);
    } else if (ch >= range3Low && ch <= range3High) {
        return handleRange3(leading, trailing);
    } else if (ch >= range4Low && ch <= range4High) {
        return handleRange4(leading, trailing);
    }
    //everything else: let the charset decoder translate the pair,
    //which expects the leading byte first
    decoder.reset();
    charBuffer.clear();
    doubleByteBuffer.clear();
    doubleByteBuffer.put((byte) leading);
    doubleByteBuffer.put((byte) trailing);
    doubleByteBuffer.flip();
    decoder.decode(doubleByteBuffer, charBuffer, true);
    charBuffer.flip();
    if (charBuffer.length() == 0) {
        LOGGER.log(POILogger.WARN, "couldn't create char for: "
                + Integer.toString((leading & 0xff), 16)
                + " " + Integer.toString((trailing & 0xff), 16));
        return UNMAPPABLE;
    }
    return Character.codePointAt(charBuffer, 0);
}
/**
 * Reads up to <code>len</code> characters into <code>cbuf</code> by
 * calling {@link #read()} once per character.
 *
 * @param cbuf destination array
 * @param off index of the first element to write
 * @param len maximum number of characters to read
 * @return the number of characters stored, or -1 if the end of the input
 *         was reached before any character could be stored
 * @throws IOException declared by {@link Reader}; not thrown by the code in this method
 */
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
    //there may be some efficiencies, but this should do for now.
    for (int i = off; i < off + len; i++) {
        int c = read();
        if (c == -1) {
            int stored = i - off;
            //the Reader contract requires -1, not 0, at end of stream;
            //returning 0 makes callers that loop until -1 spin forever
            return (stored == 0) ? -1 : stored;
        }
        cbuf[i] = (char) c;
    }
    return len;
}
/**
 * Does nothing; the method body is empty.
 */
@Override
public void close() throws IOException {
}
/**
 * Maps a byte pair from the first range (0x8140 to 0x8DFE) to its
 * character. Each leading byte covers a row of 157 consecutive values
 * and the rows start at 0xeeb8 for leading byte 0x81.
 *
 * @param leading the leading byte, 0x81 to 0x8d
 * @param trailing the trailing byte
 * @return the mapped character value
 */
private int handleRange1(int leading, int trailing) {
    int rowStart = 0xeeb8 + (leading - 0x81) * 157;
    //trailing bytes below 0x80 are counted from 0x40, the others from
    //0x62, which puts trailing byte 0xa1 at column 63
    int column = (trailing < 0x80) ? (trailing - 0x40) : (trailing - 0x62);
    return rowStart + column;
}
/**
 * Maps a byte pair from the second range (0x8E40 to 0xA0FE) to its
 * character. Each leading byte covers a row of 157 consecutive values
 * and the rows start at 0xe311 for leading byte 0x8e.
 *
 * @param leading the leading byte, 0x8e to 0xa0
 * @param trailing the trailing byte
 * @return the mapped character value
 */
private int handleRange2(int leading, int trailing) {
    int rowStart = 0xe311 + (leading - 0x8e) * 157;
    //trailing bytes below 0x80 are counted from 0x40, the others from
    //0x62, which puts trailing byte 0xa1 at column 63
    int column = (trailing < 0x80) ? (trailing - 0x40) : (trailing - 0x62);
    return rowStart + column;
}
/**
 * Maps a byte pair from the third range (0xC6A1 to 0xC8FE) to its
 * character. Each leading byte covers a row of 157 consecutive values
 * and the rows start at 0xf672 for leading byte 0xc6.
 *
 * @param leading the leading byte, 0xc6 to 0xc8
 * @param trailing the trailing byte
 * @return the mapped character value
 */
private int handleRange3(int leading, int trailing) {
    int rowStart = 0xf672 + (leading - 0xc6) * 157;
    //trailing bytes below 0x80 are counted from 0x40, the others from
    //0x62, which puts trailing byte 0xa1 at column 63
    int column = (trailing < 0x80) ? (trailing - 0x40) : (trailing - 0x62);
    return rowStart + column;
}
/**
 * Maps a byte pair from the fourth range (0xFA40 to 0xFEFE) to its
 * character. Each leading byte covers a row of 157 consecutive values
 * and the rows start at 0xe000 for leading byte 0xfa.
 *
 * @param leading the leading byte, 0xfa to 0xfe
 * @param trailing the trailing byte
 * @return the mapped character value
 */
private int handleRange4(int leading, int trailing) {
    int rowStart = 0xe000 + (leading - 0xfa) * 157;
    //trailing bytes below 0x80 are counted from 0x40, the others from
    //0x62, which puts trailing byte 0xa1 at column 63
    int column = (trailing < 0x80) ? (trailing - 0x40) : (trailing - 0x62);
    return rowStart + column;
}
//characters for leading byte 0xf9 and trailing bytes 0x40 to 0x7e,
//indexed by (trailing - 0x40)
private static final char[] F9_TRAILING_40_TO_7E = {
    /* 0x40 */ 0x7e98, 0x7e9b, 0x7e99, 0x81e0, 0x81e1, 0x8646, 0x8647, 0x8648,
    /* 0x48 */ 0x8979, 0x897a, 0x897c, 0x897b, 0x89ff, 0x8b98, 0x8b99, 0x8ea5,
    /* 0x50 */ 0x8ea4, 0x8ea3, 0x946e, 0x946d, 0x946f, 0x9471, 0x9473, 0x9749,
    /* 0x58 */ 0x9872, 0x995f, 0x9c68, 0x9c6e, 0x9c6d, 0x9e0b, 0x9e0d, 0x9e10,
    /* 0x60 */ 0x9e0f, 0x9e12, 0x9e11, 0x9ea1, 0x9ef5, 0x9f09, 0x9f47, 0x9f78,
    /* 0x68 */ 0x9f7b, 0x9f7a, 0x9f79, 0x571e, 0x7066, 0x7c6f, 0x883c, 0x8db2,
    /* 0x70 */ 0x8ea6, 0x91c3, 0x9474, 0x9478, 0x9476, 0x9475, 0x9a60, 0x9c74,
    /* 0x78 */ 0x9c73, 0x9c71, 0x9c75, 0x9e14, 0x9e13, 0x9ef6, 0x9f0a
};

//characters for leading byte 0xf9 and trailing bytes 0xa1 to 0xfe,
//indexed by (trailing - 0xa1)
private static final char[] F9_TRAILING_A1_TO_FE = {
    /* 0xa1 */ 0x9fa4, 0x7068, 0x7065, 0x7cf7, 0x866a, 0x883e, 0x883d,
    /* 0xa8 */ 0x883f, 0x8b9e, 0x8c9c, 0x8ea9, 0x8ec9, 0x974b, 0x9873, 0x9874,
    /* 0xb0 */ 0x98cc, 0x9961, 0x99ab, 0x9a64, 0x9a66, 0x9a67, 0x9b24, 0x9e15,
    /* 0xb8 */ 0x9e17, 0x9f48, 0x6207, 0x6b1e, 0x7227, 0x864c, 0x8ea8, 0x9482,
    /* 0xc0 */ 0x9480, 0x9481, 0x9a69, 0x9a68, 0x9b2e, 0x9e19, 0x7229, 0x864b,
    /* 0xc8 */ 0x8b9f, 0x9483, 0x9c79, 0x9eb7, 0x7675, 0x9a6b, 0x9c7a, 0x9e1d,
    /* 0xd0 */ 0x7069, 0x706a, 0x9ea4, 0x9f7e, 0x9f49, 0x9f98, 0x7881, 0x92b9,
    /* 0xd8 */ 0x88cf, 0x58bb, 0x6052, 0x7ca7, 0x5afa, 0x2554, 0x2566, 0x2557,
    /* 0xe0 */ 0x2560, 0x256c, 0x2563, 0x255a, 0x2569, 0x255d, 0x2552, 0x2564,
    /* 0xe8 */ 0x2555, 0x255e, 0x256a, 0x2561, 0x2558, 0x2567, 0x255b, 0x2553,
    /* 0xf0 */ 0x2565, 0x2556, 0x255f, 0x256b, 0x2562, 0x2559, 0x2568, 0x255c,
    /* 0xf8 */ 0x2551, 0x2550, 0x256d, 0x256e, 0x2570, 0x256f, 0x2593
};

/**
 * Maps a byte pair whose leading byte is 0xf9 to its character by table
 * lookup. Trailing bytes 0x40 to 0x7e and 0xa1 to 0xfe are mapped; any
 * other value is logged as a warning and yields '?'.
 *
 * @param trailing the trailing byte of the pair
 * @return the mapped character value, or '?' if the trailing byte has no mapping
 */
private int handleF9(int trailing) {
    if (trailing >= 0x40 && trailing <= 0x7e) {
        return F9_TRAILING_40_TO_7E[trailing - 0x40];
    }
    if (trailing >= 0xa1 && trailing <= 0xfe) {
        return F9_TRAILING_A1_TO_FE[trailing - 0xa1];
    }
    LOGGER.log(POILogger.WARN, "couldn't create char for: f9"
            + " " + Integer.toString((trailing & 0xff), 16));
    return UNMAPPABLE;
}
}

View File

@ -17,8 +17,6 @@
package org.apache.poi.util;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
@ -581,26 +579,6 @@ public class StringUtil {
' ', // 0xf0ff not defined
};
/**
 * This tries to convert a LE byte array in Big5 to a String.
 * <p>
 * The bytes are first passed through a {@link LittleEndianBig5Stream};
 * we know MS zero-padded ascii, and that padding is dropped. The result
 * is then decoded with the {@link #BIG5} charset.
 * However, there may be areas for improvement in this.
 *
 * @param data buffer holding the encoded text
 * @param offset index of the first byte to convert
 * @param lengthInBytes number of bytes to convert
 * @return the decoded text; if the copy fails with an IOException a warning
 *         is logged and whatever was copied up to that point is decoded
 */
public static String littleEndianBig5Stream(byte[] data, int offset, int lengthInBytes) {
    ByteArrayOutputStream reordered = new ByteArrayOutputStream();
    try {
        LittleEndianBig5Stream source = new LittleEndianBig5Stream(data, offset, lengthInBytes);
        IOUtils.copy(source, reordered);
    } catch (IOException e) {
        logger.log(POILogger.WARN,
                "IOException while copying a byte array stream to a byte array stream?!");
    }
    byte[] big5Bytes = reordered.toByteArray();
    return new String(big5Bytes, BIG5);
}
// Could be replaced with org.apache.commons.lang3.StringUtils#join
@Internal

View File

@ -108,7 +108,7 @@ public class HWPFOldDocument extends HWPFDocumentCore {
System.arraycopy(_mainStream, _fib.getFibBase().getFcMin(), textData, 0, textData.length);
int numChars = textData.length;
if (CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(guessedCharset)) {
if (CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(guessedCharset)) {
numChars /= 2;
}

View File

@ -18,7 +18,6 @@
package org.apache.poi.hwpf.model;
import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.Internal;
import org.apache.poi.util.NotImplemented;
@ -43,17 +42,6 @@ public class OldTextPiece extends TextPiece {
this.rawBytes = text;
}
/**
 * Runs the length validation of the superclass, except for pieces whose
 * charset is listed in {@link CodePageUtil#VARIABLE_BYTE_CHARSETS}.
 */
@Override
protected void validateLengths(int start, int end, int length, PieceDescriptor pd) {
    //things are still wonky with Big5 char/byte length mapping
    //sometimes working w/ Java 8 but not w/ Java 7!
    //for now, if we're dealing w/ Big5 don't bother checking
    boolean skipCheck = pd.getCharset() != null
            && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(pd.getCharset());
    if (!skipCheck) {
        super.validateLengths(start, end, length, pd);
    }
}
/**
* @return nothing, ever. Always throws an UnsupportedOperationException
* @throws UnsupportedOperationException

View File

@ -76,7 +76,7 @@ public class OldTextPieceTable extends TextPieceTable {
boolean unicode = pieces[x].isUnicode();
int multiple = 1;
if (unicode ||
(charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset))) {
(charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset))) {
multiple = 2;
}
@ -111,7 +111,7 @@ public class OldTextPieceTable extends TextPieceTable {
@Override
protected int getEncodingMultiplier(TextPiece textPiece) {
Charset charset = textPiece.getPieceDescriptor().getCharset();
if (charset != null && CodePageUtil.VARIABLE_BYTE_CHARSETS.contains(charset)) {
if (charset != null && CodePageUtil.DOUBLE_BYTE_CHARSETS.contains(charset)) {
return 2;
}
return 1;

View File

@ -20,6 +20,7 @@ package org.apache.poi.hwpf.model;
import java.nio.charset.Charset;
import org.apache.poi.util.CodePageUtil;
import org.apache.poi.util.Internal;
import org.apache.poi.util.StringUtil;
@ -60,25 +61,21 @@ public class TextPiece extends PropertyNode<TextPiece> {
// Validate
int textLength = ((CharSequence) _buf).length();
validateLengths(start, end, textLength, pd);
if (end - start != textLength) {
throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
}
if (end < start) {
throw new IllegalStateException("Told we're of negative size! start=" + start + " end=" + end);
}
}
/**
 * Checks that the character span <code>end - start</code> equals the
 * length of the text.
 *
 * @param start first character position of the piece
 * @param end end character position of the piece
 * @param textLength number of characters in the text
 * @param pd descriptor of the piece; not used by this implementation
 * @throws IllegalStateException if the span and the text length differ
 */
protected void validateLengths(int start, int end, int textLength, PieceDescriptor pd) {
    int span = end - start;
    if (span == textLength) {
        return;
    }
    throw new IllegalStateException("Told we're for characters " + start + " -> " + end + ", but actually covers " + textLength + " characters!");
}
/**
* Create the StringBuilder from the text and unicode flag
*/
private static StringBuilder buildInitSB(byte[] text, PieceDescriptor pd) {
byte[] textBuffer = text;
if (StringUtil.BIG5.equals(pd.getCharset())) {
String txt = new StringBuilder(StringUtil.littleEndianBig5Stream(text, 0, text.length)).toString();
return new StringBuilder(txt);
return new StringBuilder(CodePageUtil.cp950ToString(text, 0, text.length));
}
String str = new String(textBuffer, 0, textBuffer.length, (pd.isUnicode()) ? StringUtil.UTF16LE : pd.getCharset());

View File

@ -49,7 +49,6 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
import org.junit.Ignore;
import org.junit.Test;
/**
@ -729,7 +728,6 @@ public class TestBugs{
* Bug 51944 - PAPFormattedDiskPage.getPAPX - IndexOutOfBounds
*/
@Test
@Ignore("Test now passes in Java 1.7 and 1.8, but not 1.6")
public void testBug51944() throws Exception
{
HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");

View File

@ -247,8 +247,8 @@ public final class TestHWPFOldDocument extends HWPFTestCase {
*/
assertContains(txt, "\n9-55 xxxxx block5");
//TODO: figure out why these two aren't passing
// assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
// assertContains(txt, "We are able to");//not sure if we can get this easily?
//assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
//assertContains(txt, "We are able to");//not sure if we can get this easily?
}
}

View File

@ -0,0 +1,77 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.util;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.io.Reader;
import org.junit.Test;
public class TestLittleEndianCP950Reader {

    /**
     * Byte pairs from the user-defined ranges must map to the expected
     * characters. The reader expects the trailing byte first, so
     * <code>data[0]</code> is the trailing byte and <code>data[1]</code>
     * the leading byte.
     */
    @Test
    public void testPersonalUseMappings() throws Exception {
        //ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WindowsBestFit/bestfit950.txt
        byte[] data = new byte[2];
        data[1] = (byte) 0xfe;
        data[0] = (byte) 0xd3;
        assertCharEquals('\uE2E5', data);

        data[1] = (byte) 0x90;
        data[0] = (byte) 0xb6;
        assertCharEquals('\uE49F', data);

        //actually found in document
        //but this disagrees with file above
        data[1] = (byte) 0x8E;
        data[0] = (byte) 0xA8;
        assertCharEquals('\uE357', data);

        data[1] = (byte) 0x8E;
        data[0] = (byte) 0xE6;
        assertCharEquals('\uE395', data);

        //leading byte 0xF9 has to go into data[1] like in the cases above;
        //this case was disabled because the two bytes had been swapped
        data[1] = (byte) 0xF9;
        data[0] = (byte) 0xD8;
        assertCharEquals('\u88CF', data);
    }

    /**
     * Sanity check of the way a leading and a trailing byte are combined
     * into one 16 bit value.
     */
    @Test
    public void one() {
        byte b = (byte) 0xfe;
        byte c = (byte) 0xd3;
        int i = ((b & 0xff) << 8) + (c & 0xff);
        assertEquals(0xfed3, i);
    }

    //decodes data and checks that it yields exactly one character, expected
    private void assertCharEquals(char expected, byte[] data) throws IOException {
        Reader reader = new LittleEndianCP950Reader(data);
        int c = reader.read();
        assertEquals((int) expected, c);
        int eof = reader.read();
        assertEquals("should be end of stream", -1, eof);
    }
}