From 2f417e17bd408a81fb435582db3808a42ac90f2b Mon Sep 17 00:00:00 2001
From: PJ Fanning
Date: Mon, 3 Jul 2017 18:49:25 +0000
Subject: [PATCH] [Bug 61246] fix issue where SXSSF sheet data has unicode surrogate chars replaced by '?'

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1800705 13f79535-47bb-0310-9956-ffa450edef68
---
NOTE(review): the line breaks and leading whitespace of this patch were
reconstructed from a copy whose whitespace had been collapsed. Hunk line
counts agree with the @@ headers and the diffstat, but exact indentation is
assumed (4-space), and the whitespace-only difference between the removed and
added outputQuotedString(...) lines could not be recovered. If the context
does not match, try `git apply --ignore-whitespace`.

 .../poi/xssf/streaming/SheetDataWriter.java   | 28 +++++--
 .../xssf/streaming/TestSheetDataWriter.java   | 75 +++++++++++++++++++
 2 files changed, 96 insertions(+), 7 deletions(-)
 create mode 100644 src/ooxml/testcases/org/apache/poi/xssf/streaming/TestSheetDataWriter.java

diff --git a/src/ooxml/java/org/apache/poi/xssf/streaming/SheetDataWriter.java b/src/ooxml/java/org/apache/poi/xssf/streaming/SheetDataWriter.java
index cec86e9669..d8394a800e 100644
--- a/src/ooxml/java/org/apache/poi/xssf/streaming/SheetDataWriter.java
+++ b/src/ooxml/java/org/apache/poi/xssf/streaming/SheetDataWriter.java
@@ -122,12 +122,16 @@ public class SheetDataWriter {
      * flush and close the temp data writer.
      * This method must be invoked before calling {@link #getWorksheetXMLInputStream()}
      */
-    public void close() throws IOException{
-        _out.flush();
+    public void close() throws IOException {
+        flush();
         _out.close();
     }
+
+    protected void flush() throws IOException {
+        _out.flush();
+    }
 
-    protected File getTempFile(){
+    protected File getTempFile() {
         return _fd;
     }
 
@@ -329,7 +333,7 @@ public class SheetDataWriter {
     }
 
     //Taken from jdk1.3/src/javax/swing/text/html/HTMLWriter.java
-    protected void outputQuotedString(String s) throws IOException {
+    protected void outputQuotedString(String s) throws IOException {
         if (s == null || s.length() == 0) {
             return;
         }
@@ -393,15 +397,21 @@ public class SheetDataWriter {
                     break;
                 default:
                     // YK: XmlBeans silently replaces all ISO control characters ( < 32) with question marks.
-                    // the same rule applies to unicode surrogates and "not a character" symbols.
-                    if( c < ' ' || Character.isLowSurrogate(c) || Character.isHighSurrogate(c) ||
-                            ('\uFFFE' <= c && c <= '\uFFFF')) {
+                    // the same rule applies to "not a character" symbols.
+                    if (replaceWithQuestionMark(c)) {
                         if (counter > last) {
                             _out.write(chars, last, counter - last);
                         }
                         _out.write('?');
                         last = counter + 1;
                     }
+                    else if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
+                        if (counter > last) {
+                            _out.write(chars, last, counter - last);
+                        }
+                        _out.write(c);
+                        last = counter + 1;
+                    }
                     else if (c > 127) {
                         if (counter > last) {
                             _out.write(chars, last, counter - last);
@@ -421,6 +431,10 @@ public class SheetDataWriter {
         }
     }
 
+    static boolean replaceWithQuestionMark(char c) {
+        return c < ' ' || ('\uFFFE' <= c && c <= '\uFFFF');
+    }
+
     /**
      * Deletes the temporary file that backed this sheet on disk.
      * @return true if the file was deleted, false if it wasn't.
diff --git a/src/ooxml/testcases/org/apache/poi/xssf/streaming/TestSheetDataWriter.java b/src/ooxml/testcases/org/apache/poi/xssf/streaming/TestSheetDataWriter.java
new file mode 100644
index 0000000000..343180b409
--- /dev/null
+++ b/src/ooxml/testcases/org/apache/poi/xssf/streaming/TestSheetDataWriter.java
@@ -0,0 +1,75 @@
+/*
+ * ====================================================================
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ */
+
+package org.apache.poi.xssf.streaming;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+
+import org.apache.poi.util.IOUtils;
+import org.junit.Test;
+
+public final class TestSheetDataWriter {
+
+    final String unicodeSurrogates = "\uD835\uDF4A\uD835\uDF4B\uD835\uDF4C\uD835\uDF4D\uD835\uDF4E"
+            + "\uD835\uDF4F\uD835\uDF50\uD835\uDF51\uD835\uDF52\uD835\uDF53\uD835\uDF54\uD835"
+            + "\uDF55\uD835\uDF56\uD835\uDF57\uD835\uDF58\uD835\uDF59\uD835\uDF5A\uD835\uDF5B"
+            + "\uD835\uDF5C\uD835\uDF5D\uD835\uDF5E\uD835\uDF5F\uD835\uDF60\uD835\uDF61\uD835"
+            + "\uDF62\uD835\uDF63\uD835\uDF64\uD835\uDF65\uD835\uDF66\uD835\uDF67\uD835\uDF68"
+            + "\uD835\uDF69\uD835\uDF6A\uD835\uDF6B\uD835\uDF6C\uD835\uDF6D\uD835\uDF6E\uD835"
+            + "\uDF6F\uD835\uDF70\uD835\uDF71\uD835\uDF72\uD835\uDF73\uD835\uDF74\uD835\uDF75"
+            + "\uD835\uDF76\uD835\uDF77\uD835\uDF78\uD835\uDF79\uD835\uDF7A";
+
+    @Test
+    public void testReplaceWithQuestionMark() {
+        for(int i = 0; i < unicodeSurrogates.length(); i++) {
+            assertFalse(SheetDataWriter.replaceWithQuestionMark(unicodeSurrogates.charAt(i)));
+        }
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\uFFFE'));
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\uFFFF'));
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\u0000'));
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\u000F'));
+        assertTrue(SheetDataWriter.replaceWithQuestionMark('\u001F'));
+    }
+
+    @Test
+    public void testWriteUnicodeSurrogates() throws IOException {
+        SheetDataWriter writer = new SheetDataWriter();
+        try {
+            writer.outputQuotedString(unicodeSurrogates);
+            writer.flush();
+            File file = writer.getTempFile();
+            FileInputStream is = new FileInputStream(file);
+            String text;
+            try {
+                text = new String(IOUtils.toByteArray(is), "UTF-8");
+            } finally {
+                is.close();
+            }
+            assertEquals(unicodeSurrogates, text);
+        } finally {
+            writer.close();
+        }
+    }
+}