mirror of https://github.com/apache/poi.git
Bug 63323 -- improve handling of multibyte characters
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1857135 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
836b9abdc6
commit
ad11a5d363
|
@ -400,7 +400,11 @@ public class HwmfGraphics {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
String textString = new String(text, charset).substring(0,length).trim();
|
String textString = "";
|
||||||
|
if (text != null) {
|
||||||
|
textString = new String(text, charset).trim();
|
||||||
|
textString = textString.substring(0, Math.min(textString.length(), length));
|
||||||
|
}
|
||||||
|
|
||||||
if (textString.isEmpty()) {
|
if (textString.isEmpty()) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -395,7 +395,12 @@ public class HwmfText {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getText(Charset charset) throws IOException {
|
public String getText(Charset charset) throws IOException {
|
||||||
return new String(rawTextBytes, charset).substring(0, stringLength);
|
if (rawTextBytes == null) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
String ret = new String(rawTextBytes, charset);
|
||||||
|
return ret.substring(0,
|
||||||
|
Math.min(ret.length(), stringLength));
|
||||||
}
|
}
|
||||||
|
|
||||||
public Point2D getReference() {
|
public Point2D getReference() {
|
||||||
|
|
|
@ -35,6 +35,7 @@ import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.zip.ZipEntry;
|
import java.util.zip.ZipEntry;
|
||||||
|
@ -238,12 +239,12 @@ public class TestHwmfParsing {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore("If we decide we can use the common crawl file attached to Bug 60677, " +
|
|
||||||
"we can turn this back on")
|
|
||||||
public void testShift_JIS() throws Exception {
|
public void testShift_JIS() throws Exception {
|
||||||
//TODO: move test file to framework and fix this
|
//this file derives from common crawl: see Bug 60677
|
||||||
File f = new File("C:/data/file8.wmf");
|
HwmfPicture wmf = null;
|
||||||
HwmfPicture wmf = new HwmfPicture(new FileInputStream(f));
|
try (InputStream fis = samples.openResourceAsStream("60677.wmf")) {
|
||||||
|
wmf = new HwmfPicture(fis);
|
||||||
|
}
|
||||||
|
|
||||||
Charset charset = LocaleUtil.CHARSET_1252;
|
Charset charset = LocaleUtil.CHARSET_1252;
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
@ -263,4 +264,21 @@ public class TestHwmfParsing {
|
||||||
String txt = sb.toString();
|
String txt = sb.toString();
|
||||||
assertContains(txt, "\u822A\u7A7A\u60C5\u5831\u696D\u52D9\u3078\u306E\uFF27\uFF29\uFF33");
|
assertContains(txt, "\u822A\u7A7A\u60C5\u5831\u696D\u52D9\u3078\u306E\uFF27\uFF29\uFF33");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLengths() throws Exception {
|
||||||
|
//both String.substring and String.length count UTF-16 chars (code units), not code points.
|
||||||
|
//This test confirms that the substring calls in HwmfText
|
||||||
|
//will not truncate even beyond-BMP (supplementary plane) data.
|
||||||
|
//The last character (Deseret AY U+1040C) is a single code point encoded as 2 UTF-16 surrogate code units (chars)
|
||||||
|
String s = "\u666E\u6797\u65AF\uD801\uDC0C";
|
||||||
|
Charset utf16LE = StandardCharsets.UTF_16LE;
|
||||||
|
byte[] bytes = s.getBytes(utf16LE);
|
||||||
|
String rebuilt = new String(bytes, utf16LE);
|
||||||
|
rebuilt = rebuilt.substring(0, Math.min(bytes.length, rebuilt.length()));
|
||||||
|
assertEquals(s, rebuilt);
|
||||||
|
assertEquals(5, rebuilt.length());
|
||||||
|
long cnt = rebuilt.codePoints().count();
|
||||||
|
assertEquals(4, cnt);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue