[Bug 61246] fix issue where SXSSF sheet data has unicode surrogate chars replaced by '?'

git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1800705 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
PJ Fanning 2017-07-03 18:49:25 +00:00
parent dc0a33679c
commit 2f417e17bd
2 changed files with 96 additions and 7 deletions

View File

@ -122,12 +122,16 @@ public class SheetDataWriter {
* flush and close the temp data writer. * flush and close the temp data writer.
* This method <em>must</em> be invoked before calling {@link #getWorksheetXMLInputStream()} * This method <em>must</em> be invoked before calling {@link #getWorksheetXMLInputStream()}
*/ */
public void close() throws IOException{ public void close() throws IOException {
_out.flush(); flush();
_out.close(); _out.close();
} }
/**
 * Flushes {@code _out} without closing it.
 *
 * @throws IOException if the flush of {@code _out} fails
 */
protected void flush() throws IOException {
    this._out.flush();
}
protected File getTempFile(){ protected File getTempFile() {
return _fd; return _fd;
} }
@ -329,7 +333,7 @@ public class SheetDataWriter {
} }
//Taken from jdk1.3/src/javax/swing/text/html/HTMLWriter.java //Taken from jdk1.3/src/javax/swing/text/html/HTMLWriter.java
protected void outputQuotedString(String s) throws IOException { protected void outputQuotedString(String s) throws IOException {
if (s == null || s.length() == 0) { if (s == null || s.length() == 0) {
return; return;
} }
@ -393,15 +397,21 @@ public class SheetDataWriter {
break; break;
default: default:
// YK: XmlBeans silently replaces all ISO control characters ( < 32) with question marks. // YK: XmlBeans silently replaces all ISO control characters ( < 32) with question marks.
// the same rule applies to unicode surrogates and "not a character" symbols. // the same rule applies to "not a character" symbols.
if( c < ' ' || Character.isLowSurrogate(c) || Character.isHighSurrogate(c) || if (replaceWithQuestionMark(c)) {
('\uFFFE' <= c && c <= '\uFFFF')) {
if (counter > last) { if (counter > last) {
_out.write(chars, last, counter - last); _out.write(chars, last, counter - last);
} }
_out.write('?'); _out.write('?');
last = counter + 1; last = counter + 1;
} }
else if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
if (counter > last) {
_out.write(chars, last, counter - last);
}
_out.write(c);
last = counter + 1;
}
else if (c > 127) { else if (c > 127) {
if (counter > last) { if (counter > last) {
_out.write(chars, last, counter - last); _out.write(chars, last, counter - last);
@ -421,6 +431,10 @@ public class SheetDataWriter {
} }
} }
/**
 * Tells whether a character has to be written out as a question mark.
 *
 * @param c the character to test
 * @return {@code true} for any character below the space character and for
 *         the two "not a character" values U+FFFE and U+FFFF; {@code false} otherwise
 */
static boolean replaceWithQuestionMark(char c) {
    // everything below the space character
    if (c < ' ') {
        return true;
    }
    // the last two values a char can hold are "not a character" symbols
    return c == '\uFFFE' || c == '\uFFFF';
}
/** /**
* Deletes the temporary file that backed this sheet on disk. * Deletes the temporary file that backed this sheet on disk.
* @return true if the file was deleted, false if it wasn't. * @return true if the file was deleted, false if it wasn't.

View File

@ -0,0 +1,75 @@
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ====================================================================
*/
package org.apache.poi.xssf.streaming;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.poi.util.IOUtils;
import org.junit.Test;
/**
 * Tests for the character handling in {@link SheetDataWriter}: which characters are
 * reported as needing a question mark, and whether text made of surrogate pairs
 * comes back unchanged after being written to the temp file.
 */
public final class TestSheetDataWriter {

    /** Sample text built entirely from high/low surrogate pairs. */
    final String unicodeSurrogates = "\uD835\uDF4A\uD835\uDF4B\uD835\uDF4C\uD835\uDF4D\uD835\uDF4E"
            + "\uD835\uDF4F\uD835\uDF50\uD835\uDF51\uD835\uDF52\uD835\uDF53\uD835\uDF54\uD835"
            + "\uDF55\uD835\uDF56\uD835\uDF57\uD835\uDF58\uD835\uDF59\uD835\uDF5A\uD835\uDF5B"
            + "\uD835\uDF5C\uD835\uDF5D\uD835\uDF5E\uD835\uDF5F\uD835\uDF60\uD835\uDF61\uD835"
            + "\uDF62\uD835\uDF63\uD835\uDF64\uD835\uDF65\uD835\uDF66\uD835\uDF67\uD835\uDF68"
            + "\uD835\uDF69\uD835\uDF6A\uD835\uDF6B\uD835\uDF6C\uD835\uDF6D\uD835\uDF6E\uD835"
            + "\uDF6F\uD835\uDF70\uD835\uDF71\uD835\uDF72\uD835\uDF73\uD835\uDF74\uD835\uDF75"
            + "\uD835\uDF76\uD835\uDF77\uD835\uDF78\uD835\uDF79\uD835\uDF7A";

    /**
     * Surrogate halves must not be flagged for replacement, while the two
     * "not a character" values and characters below the space must be.
     */
    @Test
    public void testReplaceWithQuestionMark() {
        // every single char of the sample, high or low half alike, is left alone
        for (char surrogateHalf : unicodeSurrogates.toCharArray()) {
            assertFalse(SheetDataWriter.replaceWithQuestionMark(surrogateHalf));
        }

        // characters that are expected to be flagged, checked in this order
        final char[] flagged = {'\uFFFE', '\uFFFF', '\u0000', '\u000F', '\u001F'};
        for (char candidate : flagged) {
            assertTrue(SheetDataWriter.replaceWithQuestionMark(candidate));
        }
    }

    /**
     * Writes the sample text through {@code outputQuotedString}, reads the temp
     * file back as UTF-8 and expects exactly the same text.
     *
     * @throws IOException if writing to or reading from the temp file fails
     */
    @Test
    public void testWriteUnicodeSurrogates() throws IOException {
        final SheetDataWriter writer = new SheetDataWriter();
        try {
            writer.outputQuotedString(unicodeSurrogates);
            // flush first so the text just written is visible when the temp file is read
            writer.flush();

            // NOTE(review): this test never removes the temp file itself - confirm that
            // cleanup is handled elsewhere.
            final File backingFile = writer.getTempFile();
            final FileInputStream in = new FileInputStream(backingFile);
            final byte[] rawBytes;
            try {
                rawBytes = IOUtils.toByteArray(in);
            } finally {
                in.close();
            }

            assertEquals(unicodeSurrogates, new String(rawBytes, "UTF-8"));
        } finally {
            writer.close();
        }
    }
}