From dc66cc51586ae5e3ec862d71543087b601abab48 Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Thu, 19 Jan 2017 20:19:26 +0000 Subject: [PATCH] Bug 60608 -- improve charset handling in Hwmf git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1779519 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/poi/hwmf/draw/HwmfGraphics.java | 37 +++++++++-- .../org/apache/poi/hwmf/record/HwmfFont.java | 66 +++++++++++++------ .../org/apache/poi/hwmf/record/HwmfText.java | 60 +++++++++++++---- .../org/apache/poi/hwmf/TestHwmfParsing.java | 38 ++++++++++- 4 files changed, 161 insertions(+), 40 deletions(-) diff --git a/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java b/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java index 5c0f35bd03..2716696dee 100644 --- a/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java +++ b/src/scratchpad/src/org/apache/poi/hwmf/draw/HwmfGraphics.java @@ -29,6 +29,7 @@ import java.awt.font.TextAttribute; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; +import java.nio.charset.Charset; import java.text.AttributedString; import java.util.ArrayList; import java.util.LinkedList; @@ -48,8 +49,11 @@ import org.apache.poi.hwmf.record.HwmfPenStyle.HwmfLineDash; import org.apache.poi.sl.draw.DrawFactory; import org.apache.poi.sl.draw.DrawFontManager; import org.apache.poi.sl.draw.Drawable; +import org.apache.poi.util.LocaleUtil; public class HwmfGraphics { + + private static final Charset DEFAULT_CHARSET = LocaleUtil.CHARSET_1252; private final Graphics2D graphicsCtx; private final List propStack = new LinkedList(); private HwmfDrawProperties prop = new HwmfDrawProperties(); @@ -311,14 +315,34 @@ public class HwmfGraphics { break; } } - + + /** + * + * @param text + * @param bounds + * @deprecated use {@link #drawString(byte[], Rectangle2D)} + */ public void drawString(String text, Rectangle2D bounds) { drawString(text, bounds, null); } - + + public void drawString(byte[] text, Rectangle2D bounds) { + drawString(text, bounds, null); + } + + /** + * + * @param text + * @param bounds + * @deprecated use {@link #drawString(byte[], Rectangle2D, int[])} + */ public void drawString(String text, Rectangle2D bounds, int dx[]) { + drawString(text.getBytes(DEFAULT_CHARSET), bounds, dx); + } + + public void drawString(byte[] text, Rectangle2D bounds, int dx[]) { HwmfFont font = prop.getFont(); - if (font == null || text == null || text.isEmpty()) { + if (font == null || text == null || text.length == 0) { return; } @@ -326,8 +350,11 @@ public class HwmfGraphics { // TODO: another approx. ... double fontW = fontH/1.8; - int len = text.length(); - AttributedString as = new AttributedString(text); + int len = text.length; + Charset charset = (font.getCharSet().getCharset() == null)? + DEFAULT_CHARSET : font.getCharSet().getCharset(); + String textString = new String(text, charset); + AttributedString as = new AttributedString(textString); if (dx == null || dx.length == 0) { addAttributes(as, font); } else { diff --git a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java index fce75a3656..9225de47ec 100644 --- a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java +++ b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfFont.java @@ -19,67 +19,93 @@ package org.apache.poi.hwmf.record; import java.io.IOException; import java.nio.charset.Charset; +import java.nio.charset.UnsupportedCharsetException; import org.apache.poi.util.LittleEndianConsts; import org.apache.poi.util.LittleEndianInputStream; +import org.apache.poi.util.POILogFactory; +import org.apache.poi.util.POILogger; /** * The Font object specifies the attributes of a logical font */ public class HwmfFont { + + private static final POILogger logger = POILogFactory.getLogger(HwmfFont.class); + public enum WmfCharset { /** Specifies the English character set. */ - ANSI_CHARSET(0x00000000), + ANSI_CHARSET(0x00000000, "Cp1252"), /** * Specifies a character set based on the current system locale; * for example, when the system locale is United States English, * the default character set is ANSI_CHARSET. */ - DEFAULT_CHARSET(0x00000001), + DEFAULT_CHARSET(0x00000001, "Cp1252"), /** Specifies a character set of symbols. */ - SYMBOL_CHARSET(0x00000002), + SYMBOL_CHARSET(0x00000002, ""), /** Specifies the Apple Macintosh character set. */ - MAC_CHARSET(0x0000004D), + MAC_CHARSET(0x0000004D, "MacRoman"), /** Specifies the Japanese character set. */ - SHIFTJIS_CHARSET(0x00000080), + SHIFTJIS_CHARSET(0x00000080, "Shift_JIS"), /** Also spelled "Hangeul". Specifies the Hangul Korean character set. */ - HANGUL_CHARSET(0x00000081), + HANGUL_CHARSET(0x00000081, "cp949"), /** Also spelled "Johap". Specifies the Johab Korean character set. */ - JOHAB_CHARSET(0x00000082), + JOHAB_CHARSET(0x00000082, "x-Johab"), /** Specifies the "simplified" Chinese character set for People's Republic of China. */ - GB2312_CHARSET(0x00000086), + GB2312_CHARSET(0x00000086, "GB2312"), /** * Specifies the "traditional" Chinese character set, used mostly in * Taiwan and in the Hong Kong and Macao Special Administrative Regions. */ - CHINESEBIG5_CHARSET(0x00000088), + CHINESEBIG5_CHARSET(0x00000088, "Big5"), /** Specifies the Greek character set. */ - GREEK_CHARSET(0x000000A1), + GREEK_CHARSET(0x000000A1, "Cp1253"), /** Specifies the Turkish character set. */ - TURKISH_CHARSET(0x000000A2), + TURKISH_CHARSET(0x000000A2, "Cp1254"), /** Specifies the Vietnamese character set. */ - VIETNAMESE_CHARSET(0x000000A3), + VIETNAMESE_CHARSET(0x000000A3, "Cp1258"), /** Specifies the Hebrew character set. */ - HEBREW_CHARSET(0x000000B1), + HEBREW_CHARSET(0x000000B1, "Cp1255"), /** Specifies the Arabic character set. */ - ARABIC_CHARSET(0x000000B2), + ARABIC_CHARSET(0x000000B2, "Cp1256"), /** Specifies the Baltic (Northeastern European) character set. */ - BALTIC_CHARSET(0x000000BA), + BALTIC_CHARSET(0x000000BA, "Cp1257"), /** Specifies the Russian Cyrillic character set. */ - RUSSIAN_CHARSET(0x000000CC), + RUSSIAN_CHARSET(0x000000CC, "Cp1251"), /** Specifies the Thai character set. */ - THAI_CHARSET(0x000000DE), + THAI_CHARSET(0x000000DE, "x-windows-874"), /** Specifies a Eastern European character set. */ - EASTEUROPE_CHARSET(0x000000EE), + EASTEUROPE_CHARSET(0x000000EE, "Cp1250"), /** * Specifies a mapping to one of the OEM code pages, * according to the current system locale setting. */ - OEM_CHARSET(0x000000FF); + OEM_CHARSET(0x000000FF, "Cp1252"); int flag; - WmfCharset(int flag) { + Charset charset; + + WmfCharset(int flag, String javaCharsetName) { this.flag = flag; + if (javaCharsetName.length() > 0) { + try { + charset = Charset.forName(javaCharsetName); + return; + } catch (UnsupportedCharsetException e) { + logger.log(POILogger.WARN, "Unsupported charset: "+javaCharsetName); + } + } + charset = null; + } + + /** + * + * @return charset for the font or null if there is no matching charset or + * if the charset is a "default" + */ + public Charset getCharset() { + return charset; } static WmfCharset valueOf(int flag) { diff --git a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java index 03ebc353c4..108970d761 100644 --- a/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java +++ b/src/scratchpad/src/org/apache/poi/hwmf/record/HwmfText.java @@ -19,6 +19,7 @@ package org.apache.poi.hwmf.record; import java.awt.geom.Rectangle2D; import java.io.IOException; +import java.nio.charset.Charset; import org.apache.poi.hwmf.draw.HwmfDrawProperties; import org.apache.poi.hwmf.draw.HwmfGraphics; @@ -27,7 +28,6 @@ import org.apache.poi.util.BitField; import org.apache.poi.util.BitFieldFactory; import org.apache.poi.util.LittleEndianConsts; import org.apache.poi.util.LittleEndianInputStream; -import org.apache.poi.util.LocaleUtil; import org.apache.poi.util.POILogFactory; import org.apache.poi.util.POILogger; @@ -144,7 +144,7 @@ public class HwmfText { * length of the string. * The string is written at the location specified by the XStart and YStart fields. */ - private String text; + private byte[] rawTextBytes; /** * A 16-bit signed integer that defines the vertical (y-axis) coordinate, in logical * units, of the point where drawing is to start. @@ -164,18 +164,33 @@ public class HwmfText { @Override public int init(LittleEndianInputStream leis, long recordSize, int recordFunction) throws IOException { stringLength = leis.readShort(); - byte buf[] = new byte[stringLength+(stringLength&1)]; - leis.readFully(buf); - text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252).trim(); + rawTextBytes = new byte[stringLength+(stringLength&1)]; + leis.readFully(rawTextBytes); yStart = leis.readShort(); xStart = leis.readShort(); - return 3*LittleEndianConsts.SHORT_SIZE+buf.length; + return 3*LittleEndianConsts.SHORT_SIZE+rawTextBytes.length; } @Override public void draw(HwmfGraphics ctx) { Rectangle2D bounds = new Rectangle2D.Double(xStart, yStart, 0, 0); - ctx.drawString(text, bounds); + ctx.drawString(getTextBytes(), bounds); + } + + public String getText(Charset charset) { + return new String(getTextBytes(), charset); + } + + /** + * + * @return a copy of a trimmed byte array of rawTextBytes bytes. + * This includes only the bytes from 0..stringLength. + * This does not include the extra optional padding on the byte array. + */ + private byte[] getTextBytes() { + byte[] ret = new byte[stringLength]; + System.arraycopy(rawTextBytes, 0, ret, 0, stringLength); + return ret; } } @@ -264,7 +279,7 @@ public class HwmfText { * the length is odd, an extra byte is placed after it so that the following member (optional Dx) is * aligned on a 16-bit boundary. */ - private String text; + private byte[] rawTextBytes; /** * An optional array of 16-bit signed integers that indicate the distance between * origins of adjacent character cells. For example, Dx[i] logical units separate the origins of @@ -300,10 +315,9 @@ public class HwmfText { size += 4*LittleEndianConsts.SHORT_SIZE; } - byte buf[] = new byte[stringLength+(stringLength&1)]; - leis.readFully(buf); - text = new String(buf, 0, stringLength, LocaleUtil.CHARSET_1252); - size += buf.length; + rawTextBytes = new byte[stringLength+(stringLength&1)]; + leis.readFully(rawTextBytes); + size += rawTextBytes.length; if (size >= remainingRecordSize) { logger.log(POILogger.INFO, "META_EXTTEXTOUT doesn't contain character tracking info"); @@ -327,7 +341,23 @@ public class HwmfText { @Override public void draw(HwmfGraphics ctx) { Rectangle2D bounds = new Rectangle2D.Double(x, y, 0, 0); - ctx.drawString(text, bounds, dx); + ctx.drawString(getTextBytes(), bounds, dx); + } + + public String getText(Charset charset) { + return new String(getTextBytes(), charset); + } + + /** + * + * @return a copy of a trimmed byte array of rawTextBytes bytes. + * This includes only the bytes from 0..stringLength. + * This does not include the extra optional padding on the byte array. + */ + private byte[] getTextBytes() { + byte[] ret = new byte[stringLength]; + System.arraycopy(rawTextBytes, 0, ret, 0, stringLength); + return ret; } } @@ -523,5 +553,9 @@ public class HwmfText { public void applyObject(HwmfGraphics ctx) { ctx.getProperties().setFont(font); } + + public HwmfFont getFont() { + return font; + } } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java b/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java index 17226a654b..2f0838f128 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java +++ b/src/scratchpad/testcases/org/apache/poi/hwmf/TestHwmfParsing.java @@ -18,7 +18,9 @@ package org.apache.poi.hwmf; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import javax.imageio.ImageIO; import java.awt.Dimension; import java.awt.Graphics2D; import java.awt.RenderingHints; @@ -31,21 +33,24 @@ import java.io.FileOutputStream; import java.io.FilterInputStream; import java.io.IOException; import java.net.URL; +import java.nio.charset.Charset; import java.util.List; import java.util.Locale; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import javax.imageio.ImageIO; - import org.apache.poi.POIDataSamples; import org.apache.poi.hwmf.record.HwmfFill.HwmfImageRecord; +import org.apache.poi.hwmf.record.HwmfFont; import org.apache.poi.hwmf.record.HwmfRecord; +import org.apache.poi.hwmf.record.HwmfRecordType; +import org.apache.poi.hwmf.record.HwmfText; import org.apache.poi.hwmf.usermodel.HwmfPicture; import org.apache.poi.sl.usermodel.PictureData; import org.apache.poi.sl.usermodel.PictureData.PictureType; import org.apache.poi.sl.usermodel.SlideShow; import org.apache.poi.sl.usermodel.SlideShowFactory; +import org.apache.poi.util.LocaleUtil; import org.apache.poi.util.Units; import org.junit.Ignore; import org.junit.Test; @@ -188,4 +193,33 @@ public class TestHwmfParsing { } } } + + @Test + @Ignore("If we decide we can use common crawl file specified, we can turn this back on") + public void testCyrillic() throws Exception { + //TODO: move test file to framework and fix this + File dir = new File("C:/somethingOrOther"); + File f = new File(dir, "ZMLH54SPLI76NQ7XMKVB7SMUJA2HTXTS-2.wmf"); + HwmfPicture wmf = new HwmfPicture(new FileInputStream(f)); + + Charset charset = LocaleUtil.CHARSET_1252; + StringBuilder sb = new StringBuilder(); + //this is pure hackery for specifying the font + //this happens to work on this test file, but you need to + //do what Graphics does by maintaining the stack, etc.! + for (HwmfRecord r : wmf.getRecords()) { + if (r.getRecordType().equals(HwmfRecordType.createFontIndirect)) { + HwmfFont font = ((HwmfText.WmfCreateFontIndirect)r).getFont(); + charset = (font.getCharSet().getCharset() == null) ? LocaleUtil.CHARSET_1252 : font.getCharSet().getCharset(); + } + if (r.getRecordType().equals(HwmfRecordType.extTextOut)) { + HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut)r; + sb.append(textOut.getText(charset)).append("\n"); + } + } + String txt = sb.toString(); + assertTrue(txt.contains("\u041E\u0431\u0449\u043E")); + assertTrue(txt.contains("\u0411\u0430\u043B\u0430\u043D\u0441")); + } + }