Unicode escaping now emits uppercase hex digits, e.g. \uABCD

Found and fixed a bug where a unicode escape at the very end of the string being unescaped was not written to the output
Added unit tests for above bug to both StringUtilsTest and StringEscapeUtilsTest
StringUtils.[un]escape now call StringEscapeUtils.[un]escapeJava


git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137291 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Alex Chaffee 2003-04-09 18:45:29 +00:00
parent 2e862e0c47
commit 6af3b80369
4 changed files with 52 additions and 148 deletions

View File

@ -75,7 +75,7 @@ import org.apache.commons.lang.exception.NestableRuntimeException;
* @author Helge Tesgaard
* @author <a href="mailto:sean@boohai.com">Sean Brown</a>
* @since 2.0
* @version $Id: StringEscapeUtils.java,v 1.4 2003/04/09 00:07:49 ggregory Exp $
* @version $Id: StringEscapeUtils.java,v 1.5 2003/04/09 18:45:28 alex Exp $
*/
public class StringEscapeUtils {
@ -184,11 +184,11 @@ public class StringEscapeUtils {
// handle unicode
if (ch > 0xfff) {
out.write("\\u" + Integer.toHexString(ch));
out.write("\\u" + hex(ch));
} else if (ch > 0xff) {
out.write("\\u0" + Integer.toHexString(ch));
out.write("\\u0" + hex(ch));
} else if (ch > 0x7f) {
out.write("\\u00" + Integer.toHexString(ch));
out.write("\\u00" + hex(ch));
} else if (ch < 32) {
switch (ch) {
case '\b':
@ -213,9 +213,9 @@ public class StringEscapeUtils {
break;
default :
if (ch > 0xf) {
out.write("\\u00" + Integer.toHexString(ch));
out.write("\\u00" + hex(ch));
} else {
out.write("\\u000" + Integer.toHexString(ch));
out.write("\\u000" + hex(ch));
}
break;
}
@ -241,6 +241,10 @@ public class StringEscapeUtils {
}
}
/**
 * Converts a character to the hexadecimal form of its numeric value,
 * using uppercase letters for the digits A-F.
 *
 * <p>The result is not zero-padded; callers prepend whatever padding
 * they need to reach four digits.</p>
 *
 * @param ch the character whose value is converted
 * @return the uppercase hexadecimal digits of <code>ch</code>
 */
private static String hex(char ch) {
    String lowerCaseDigits = Integer.toHexString(ch);
    return lowerCaseDigits.toUpperCase();
}
/**
* Unescapes any Java literals found in the String. For example,
* it will turn a sequence of '\' and 'n' into a newline character,
@ -268,6 +272,7 @@ public class StringEscapeUtils {
if (inUnicode) {
// if in unicode, then we're reading unicode
// values in somehow
unicode.append(ch);
if (unicode.length() == 4) {
// unicode now contains the four hex digits
// which represent our unicode character
@ -275,16 +280,13 @@ public class StringEscapeUtils {
int value = Integer.parseInt(unicode.toString(), 16);
out.write((char) value);
unicode.setLength(0);
unicode.setLength(4);
inUnicode = false;
hadSlash = false;
} catch (NumberFormatException nfe) {
throw new NestableRuntimeException("Unable to parse unicode value: " + unicode, nfe);
}
} else {
unicode.append(ch);
continue;
}
continue;
}
if (hadSlash) {
// handle an escaped value

View File

@ -78,7 +78,7 @@ import org.apache.commons.lang.exception.NestableRuntimeException;
* @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
* @author Arun Mammen Thomas
* @since 1.0
* @version $Id: StringUtils.java,v 1.41 2003/04/09 00:07:50 ggregory Exp $
* @version $Id: StringUtils.java,v 1.42 2003/04/09 18:45:29 alex Exp $
*/
public class StringUtils {
@ -1140,147 +1140,30 @@ public class StringUtils {
* <p>So a tab becomes the characters <code>'\\'</code> and
* <code>'t'</code>.</p>
*
* <p>As of Lang 2.0, this calls {@link StringEscapeUtils#escapeJava(java.lang.String)}
* behind the scenes. For convenience, this method is not deprecated.
* </p>
* @see StringEscapeUtils#escapeJava(java.lang.String)
* @param str String to escape values in
* @return String with escaped values
* @throws NullPointerException if str is <code>null</code>
*/
public static String escape(String str) {
// improved with code from cybertiger@cyberiantiger.org
// unicode from him, and defaul for < 32's.
int sz = str.length();
StringBuffer buffer = new StringBuffer(2 * sz);
for (int i = 0; i < sz; i++) {
char ch = str.charAt(i);
// handle unicode
if (ch > 0xfff) {
buffer.append("\\u" + Integer.toHexString(ch));
} else if (ch > 0xff) {
buffer.append("\\u0" + Integer.toHexString(ch));
} else if (ch > 0x7f) {
buffer.append("\\u00" + Integer.toHexString(ch));
} else if (ch < 32) {
switch (ch) {
case '\b' :
buffer.append('\\');
buffer.append('b');
break;
case '\n' :
buffer.append('\\');
buffer.append('n');
break;
case '\t' :
buffer.append('\\');
buffer.append('t');
break;
case '\f' :
buffer.append('\\');
buffer.append('f');
break;
case '\r' :
buffer.append('\\');
buffer.append('r');
break;
default :
if (ch > 0xf) {
buffer.append("\\u00" + Integer.toHexString(ch));
} else {
buffer.append("\\u000" + Integer.toHexString(ch));
}
break;
}
} else {
switch (ch) {
case '\'' :
buffer.append('\\');
buffer.append('\'');
break;
case '"' :
buffer.append('\\');
buffer.append('"');
break;
case '\\' :
buffer.append('\\');
buffer.append('\\');
break;
default :
buffer.append(ch);
break;
}
}
}
return buffer.toString();
return StringEscapeUtils.escapeJava(str);
}
/**
* Unescapes any Java literals found in the String. For example,
* it will turn a sequence of '\' and 'n' into a newline character,
* unless the '\' is preceded by another '\'.
* <p>
* As of Lang 2.0, this calls {@link StringEscapeUtils#unescapeJava(java.lang.String)}
* behind the scenes. For convenience, this method is not deprecated.
* </p>
* @see StringEscapeUtils#unescapeJava(java.lang.String)
*/
public static String unescape(String str) {
int sz = str.length();
StringBuffer buffer = new StringBuffer(sz);
StringBuffer unicode = new StringBuffer(4);
boolean hadSlash = false;
boolean inUnicode = false;
for (int i = 0; i < sz; i++) {
char ch = str.charAt(i);
if(inUnicode) {
// if in unicode, then we're reading unicode
// values in somehow
if(unicode.length() == 4) {
// unicode now contains the four hex digits
// which represents our unicode chacater
try {
int value = Integer.parseInt(unicode.toString(), 16);
buffer.append( (char)value );
unicode.setLength(0);
unicode.setLength(4);
inUnicode = false;
hadSlash = false;
} catch(NumberFormatException nfe) {
throw new NestableRuntimeException("Unable to parse unicode value: "+unicode, nfe);
}
} else {
unicode.append(ch);
continue;
}
}
if(hadSlash) {
// handle an escaped value
hadSlash = false;
switch(ch) {
case '\\': buffer.append('\\'); break;
case '\'': buffer.append('\''); break;
case '\"': buffer.append('"'); break;
case 'r': buffer.append('\r'); break;
case 'f': buffer.append('\f'); break;
case 't': buffer.append('\t'); break;
case 'n': buffer.append('\n'); break;
case 'b': buffer.append('\b'); break;
case 'u': {
// uh-oh, we're in unicode country....
inUnicode=true;
break;
}
default :
buffer.append(ch);
break;
}
continue;
} else
if(ch == '\\') {
hadSlash = true;
continue;
}
buffer.append(ch);
}
if(hadSlash) {
// then we're in the weird case of a \ at the end of the
// string, let's output it anyway.
buffer.append('\\');
}
return buffer.toString();
return StringEscapeUtils.unescapeJava(str);
}
// Padding

View File

@ -62,11 +62,11 @@ import junit.framework.TestSuite;
import junit.textui.TestRunner;
/**
* Unit tests {@link StringUtils}.
* Unit tests for {@link StringEscapeUtils}.
*
* @author of original StringUtilsTest.testEscape = ?
* @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
* @version $Id: StringEscapeUtilsTest.java,v 1.2 2003/04/09 17:30:29 alex Exp $
* @version $Id: StringEscapeUtilsTest.java,v 1.3 2003/04/09 18:45:29 alex Exp $
*/
public class StringEscapeUtilsTest extends TestCase {
private final static String FOO = "foo";
@ -96,13 +96,15 @@ public class StringEscapeUtilsTest extends TestCase {
assertEscapeJava("\\\\\\b\\t\\r", "\\\b\t\r");
assertEscapeJava("\\u1234", "\u1234");
assertEscapeJava("\\u0234", "\u0234");
assertEscapeJava("\\u00fd", "\u00fd");
assertEscapeJava("\\u00EF", "\u00ef");
assertEscapeJava("\\u0001", "\u0001");
assertEscapeJava("Should use capitalized unicode hex", "\\uABCD", "\uabcd");
assertEscapeJava("He didn't say, \\\"stop!\\\"",
"He didn't say, \"stop!\"");
assertEscapeJava("non-breaking space", "This space is non-breaking:" + "\\u00a0",
assertEscapeJava("non-breaking space", "This space is non-breaking:" + "\\u00A0",
"This space is non-breaking:\u00a0");
assertEscapeJava("\\uabcd\\u1234\\u012c",
assertEscapeJava("\\uABCD\\u1234\\u012C",
"\uABCD\u1234\u012C");
}
@ -125,11 +127,26 @@ public class StringEscapeUtilsTest extends TestCase {
assertUnescapeJava("test", "test");
assertUnescapeJava("\ntest\b", "\\ntest\\b");
assertUnescapeJava("\u123425foo\ntest\b", "\\u123425foo\\ntest\\b");
//foo
assertUnescapeJava("lowercase unicode", "\uABCDx", "\\uabcdx");
assertUnescapeJava("uppercase unicode", "\uABCDx", "\\uABCDx");
assertUnescapeJava("unicode as final character", "\uABCD", "\\uabcd");
}
private void assertUnescapeJava(String unescaped, String original) throws IOException {
assertEquals("unescape(String) failed",
unescaped, StringUtils.unescape(original));
assertUnescapeJava(null, unescaped, original);
}
private void assertUnescapeJava(String message, String unescaped, String original) throws IOException {
String expected = unescaped;
String actual = StringEscapeUtils.unescapeJava(original);
assertEquals("unescape(String) failed" +
(message == null ? "" : (": " + message)) +
// we escape this so we can see it in the error message
": expected '" + StringUtils.escape(expected) +
"' actual '" + StringUtils.escape(actual) + "'",
expected, actual);
StringPrintWriter writer = new StringPrintWriter();
StringEscapeUtils.unescapeJava(writer, original);

View File

@ -69,7 +69,7 @@ import junit.textui.TestRunner;
* @author <a href="mailto:fredrik@westermarck.com">Fredrik Westermarck</a>
* @author Holger Krauth
* @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
* @version $Id: StringUtilsTest.java,v 1.17 2003/03/29 16:17:21 alex Exp $
* @version $Id: StringUtilsTest.java,v 1.18 2003/04/09 18:45:29 alex Exp $
*/
public class StringUtilsTest extends TestCase {
@ -432,7 +432,7 @@ public class StringUtilsTest extends TestCase {
assertEquals("escape(String) failed",
"\\u0234", StringUtils.escape("\u0234") );
assertEquals("escape(String) failed",
"\\u00fd", StringUtils.escape("\u00fd") );
"\\u00FD", StringUtils.escape("\u00fd") );
assertEquals("unescape(String) failed",
"", StringUtils.unescape("") );
assertEquals("unescape(String) failed",
@ -441,6 +441,8 @@ public class StringUtilsTest extends TestCase {
"\ntest\b", StringUtils.unescape("\\ntest\\b") );
assertEquals("unescape(String) failed",
"\u123425foo\ntest\b", StringUtils.unescape("\\u123425foo\\ntest\\b") );
assertEquals("unescape(String) failed with unicode as final char",
"\u1234", StringUtils.unescape("\\u1234") );
}
public void testGetLevenshteinDistance() {