From ad11a5d3631c0b54ab793407517a67552b9f8e2e Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Mon, 8 Apr 2019 19:51:16 +0000 Subject: [PATCH] Bug 63323 -- improve handling of multibyte characters git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1857135 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/poi/hwmf/draw/HwmfGraphics.java | 6 +++- .../org/apache/poi/hwmf/record/HwmfText.java | 7 ++++- .../org/apache/poi/hwmf/TestHwmfParsing.java | 28 +++++++++++++++---- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java b/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java index 05f5d200be..766f9020df 100644 --- a/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java +++ b/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java @@ -400,7 +400,11 @@ public class HwmfGraphics { } } - String textString = new String(text, charset).substring(0,length).trim(); + String textString = ""; + if (text != null) { + textString = new String(text, charset).trim(); + textString = textString.substring(0, Math.min(textString.length(), length)); + } if (textString.isEmpty()) { return; diff --git a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java index 061f5cfc66..6af61b81ac 100644 --- a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java +++ b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java @@ -395,7 +395,12 @@ public class HwmfText { } public String getText(Charset charset) throws IOException { - return new String(rawTextBytes, charset).substring(0, stringLength); + if (rawTextBytes == null) { + return ""; + } + String ret = new String(rawTextBytes, charset); + return ret.substring(0, + Math.min(ret.length(), stringLength)); } public Point2D getReference() { diff --git a/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java 
b/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java index 4a988250fe..7632284296 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java +++ b/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java @@ -35,6 +35,7 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Locale; import java.util.zip.ZipEntry; @@ -238,12 +239,12 @@ public class TestHwmfParsing { } @Test - @Ignore("If we decide we can use the common crawl file attached to Bug 60677, " + - "we can turn this back on") public void testShift_JIS() throws Exception { - //TODO: move test file to framework and fix this - File f = new File("C:/data/file8.wmf"); - HwmfPicture wmf = new HwmfPicture(new FileInputStream(f)); + //this file derives from common crawl: see Bug 60677 + HwmfPicture wmf = null; + try (InputStream fis = samples.openResourceAsStream("60677.wmf")) { + wmf = new HwmfPicture(fis); + } Charset charset = LocaleUtil.CHARSET_1252; StringBuilder sb = new StringBuilder(); @@ -263,4 +264,21 @@ public class TestHwmfParsing { String txt = sb.toString(); assertContains(txt, "\u822A\u7A7A\u60C5\u5831\u696D\u52D9\u3078\u306E\uFF27\uFF29\uFF33"); } + + @Test + public void testLengths() throws Exception { + //both substring and length rely on char, not codepoints. + //This test confirms that the substring calls in HwmfText + //will not truncate even beyond-bmp data. 
+ //The last character (Deseret AY U+1040C) is a single code point encoded as 2 UTF-16 surrogates (code units) + String s = "\u666E\u6797\u65AF\uD801\uDC0C"; + Charset utf16LE = StandardCharsets.UTF_16LE; + byte[] bytes = s.getBytes(utf16LE); + String rebuilt = new String(bytes, utf16LE); + rebuilt = rebuilt.substring(0, Math.min(bytes.length, rebuilt.length())); + assertEquals(s, rebuilt); + assertEquals(5, rebuilt.length()); + long cnt = rebuilt.codePoints().count(); + assertEquals(4, cnt); + } }