Escaping unicode uses capital letters, e.g. \uABCD
Found and fixed a bug when a unicode character is at the end of a string to unescape. Added unit tests for the above bug to both StringUtilsTest and StringEscapeUtilsTest. StringUtils.[un]escape now call StringEscapeUtils.[un]escapeJava. git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137291 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2e862e0c47
commit
6af3b80369
|
@ -75,7 +75,7 @@ import org.apache.commons.lang.exception.NestableRuntimeException;
|
|||
* @author Helge Tesgaard
|
||||
* @author <a href="sean@boohai.com">Sean Brown</a>
|
||||
* @since 2.0
|
||||
* @version $Id: StringEscapeUtils.java,v 1.4 2003/04/09 00:07:49 ggregory Exp $
|
||||
* @version $Id: StringEscapeUtils.java,v 1.5 2003/04/09 18:45:28 alex Exp $
|
||||
*/
|
||||
public class StringEscapeUtils {
|
||||
|
||||
|
@ -184,11 +184,11 @@ public class StringEscapeUtils {
|
|||
|
||||
// handle unicode
|
||||
if (ch > 0xfff) {
|
||||
out.write("\\u" + Integer.toHexString(ch));
|
||||
out.write("\\u" + hex(ch));
|
||||
} else if (ch > 0xff) {
|
||||
out.write("\\u0" + Integer.toHexString(ch));
|
||||
out.write("\\u0" + hex(ch));
|
||||
} else if (ch > 0x7f) {
|
||||
out.write("\\u00" + Integer.toHexString(ch));
|
||||
out.write("\\u00" + hex(ch));
|
||||
} else if (ch < 32) {
|
||||
switch (ch) {
|
||||
case '\b':
|
||||
|
@ -213,9 +213,9 @@ public class StringEscapeUtils {
|
|||
break;
|
||||
default :
|
||||
if (ch > 0xf) {
|
||||
out.write("\\u00" + Integer.toHexString(ch));
|
||||
out.write("\\u00" + hex(ch));
|
||||
} else {
|
||||
out.write("\\u000" + Integer.toHexString(ch));
|
||||
out.write("\\u000" + hex(ch));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -241,6 +241,10 @@ public class StringEscapeUtils {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the hexadecimal digits of the given character in upper case.
 * The result is not zero-padded: the visible callers prepend the
 * backslash-u prefix together with any leading zeros themselves.
 *
 * @param ch the character to convert
 * @return the upper-case hexadecimal string for <code>ch</code>
 */
private static String hex(char ch) {
|
||||
// Integer.toHexString yields lower-case digits; upper-case them for the escaped output
return Integer.toHexString(ch).toUpperCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* Unescapes any Java literals found in the String. For example,
|
||||
* it will turn a sequence of '\' and 'n' into a newline character,
|
||||
|
@ -268,6 +272,7 @@ public class StringEscapeUtils {
|
|||
if (inUnicode) {
|
||||
// if in unicode, then we're reading unicode
|
||||
// values in somehow
|
||||
unicode.append(ch);
|
||||
if (unicode.length() == 4) {
|
||||
// unicode now contains the four hex digits
|
||||
// which represents our unicode chacater
|
||||
|
@ -275,16 +280,13 @@ public class StringEscapeUtils {
|
|||
int value = Integer.parseInt(unicode.toString(), 16);
|
||||
out.write((char) value);
|
||||
unicode.setLength(0);
|
||||
unicode.setLength(4);
|
||||
inUnicode = false;
|
||||
hadSlash = false;
|
||||
} catch (NumberFormatException nfe) {
|
||||
throw new NestableRuntimeException("Unable to parse unicode value: " + unicode, nfe);
|
||||
}
|
||||
} else {
|
||||
unicode.append(ch);
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (hadSlash) {
|
||||
// handle an escaped value
|
||||
|
|
|
@ -78,7 +78,7 @@ import org.apache.commons.lang.exception.NestableRuntimeException;
|
|||
* @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
|
||||
* @author Arun Mammen Thomas
|
||||
* @since 1.0
|
||||
* @version $Id: StringUtils.java,v 1.41 2003/04/09 00:07:50 ggregory Exp $
|
||||
* @version $Id: StringUtils.java,v 1.42 2003/04/09 18:45:29 alex Exp $
|
||||
*/
|
||||
public class StringUtils {
|
||||
|
||||
|
@ -1140,147 +1140,30 @@ public class StringUtils {
|
|||
* <p>So a tab becomes the characters <code>'\\'</code> and
|
||||
* <code>'t'</code>.</p>
|
||||
*
|
||||
* <p>As of Lang 2.0, this calls {@link StringEscapeUtils#escapeJava(java.lang.String)}
|
||||
* behind the scenes. For convenience, this method is not deprecated.
|
||||
* </p>
|
||||
* @see StringEscapeUtils#escapeJava(java.lang.String)
|
||||
* @param str String to escape values in
|
||||
* @return String with escaped values
|
||||
* @throws NullPointerException if str is <code>null</code>
|
||||
*/
|
||||
public static String escape(String str) {
|
||||
// improved with code from cybertiger@cyberiantiger.org
|
||||
// unicode from him, and defaul for < 32's.
|
||||
int sz = str.length();
|
||||
StringBuffer buffer = new StringBuffer(2 * sz);
|
||||
for (int i = 0; i < sz; i++) {
|
||||
char ch = str.charAt(i);
|
||||
|
||||
// handle unicode
|
||||
if (ch > 0xfff) {
|
||||
buffer.append("\\u" + Integer.toHexString(ch));
|
||||
} else if (ch > 0xff) {
|
||||
buffer.append("\\u0" + Integer.toHexString(ch));
|
||||
} else if (ch > 0x7f) {
|
||||
buffer.append("\\u00" + Integer.toHexString(ch));
|
||||
} else if (ch < 32) {
|
||||
switch (ch) {
|
||||
case '\b' :
|
||||
buffer.append('\\');
|
||||
buffer.append('b');
|
||||
break;
|
||||
case '\n' :
|
||||
buffer.append('\\');
|
||||
buffer.append('n');
|
||||
break;
|
||||
case '\t' :
|
||||
buffer.append('\\');
|
||||
buffer.append('t');
|
||||
break;
|
||||
case '\f' :
|
||||
buffer.append('\\');
|
||||
buffer.append('f');
|
||||
break;
|
||||
case '\r' :
|
||||
buffer.append('\\');
|
||||
buffer.append('r');
|
||||
break;
|
||||
default :
|
||||
if (ch > 0xf) {
|
||||
buffer.append("\\u00" + Integer.toHexString(ch));
|
||||
} else {
|
||||
buffer.append("\\u000" + Integer.toHexString(ch));
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
switch (ch) {
|
||||
case '\'' :
|
||||
buffer.append('\\');
|
||||
buffer.append('\'');
|
||||
break;
|
||||
case '"' :
|
||||
buffer.append('\\');
|
||||
buffer.append('"');
|
||||
break;
|
||||
case '\\' :
|
||||
buffer.append('\\');
|
||||
buffer.append('\\');
|
||||
break;
|
||||
default :
|
||||
buffer.append(ch);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return buffer.toString();
|
||||
return StringEscapeUtils.escapeJava(str);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unescapes any Java literals found in the String. For example,
|
||||
* it will turn a sequence of '\' and 'n' into a newline character,
|
||||
* unless the '\' is preceded by another '\'.
|
||||
* <p>
|
||||
* As of Lang 2.0, this calls {@link StringEscapeUtils#unescapeJava(java.lang.String)}
|
||||
* behind the scenes. For convenience, this method is not deprecated.
|
||||
* <p>
|
||||
* @see StringEscapeUtils#unescapeJava(java.lang.String)
|
||||
*/
|
||||
public static String unescape(String str) {
|
||||
int sz = str.length();
|
||||
StringBuffer buffer = new StringBuffer(sz);
|
||||
StringBuffer unicode = new StringBuffer(4);
|
||||
boolean hadSlash = false;
|
||||
boolean inUnicode = false;
|
||||
for (int i = 0; i < sz; i++) {
|
||||
char ch = str.charAt(i);
|
||||
if(inUnicode) {
|
||||
// if in unicode, then we're reading unicode
|
||||
// values in somehow
|
||||
if(unicode.length() == 4) {
|
||||
// unicode now contains the four hex digits
|
||||
// which represents our unicode chacater
|
||||
try {
|
||||
int value = Integer.parseInt(unicode.toString(), 16);
|
||||
buffer.append( (char)value );
|
||||
unicode.setLength(0);
|
||||
unicode.setLength(4);
|
||||
inUnicode = false;
|
||||
hadSlash = false;
|
||||
} catch(NumberFormatException nfe) {
|
||||
throw new NestableRuntimeException("Unable to parse unicode value: "+unicode, nfe);
|
||||
}
|
||||
} else {
|
||||
unicode.append(ch);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(hadSlash) {
|
||||
// handle an escaped value
|
||||
hadSlash = false;
|
||||
switch(ch) {
|
||||
case '\\': buffer.append('\\'); break;
|
||||
case '\'': buffer.append('\''); break;
|
||||
case '\"': buffer.append('"'); break;
|
||||
case 'r': buffer.append('\r'); break;
|
||||
case 'f': buffer.append('\f'); break;
|
||||
case 't': buffer.append('\t'); break;
|
||||
case 'n': buffer.append('\n'); break;
|
||||
case 'b': buffer.append('\b'); break;
|
||||
case 'u': {
|
||||
// uh-oh, we're in unicode country....
|
||||
inUnicode=true;
|
||||
break;
|
||||
}
|
||||
default :
|
||||
buffer.append(ch);
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
} else
|
||||
if(ch == '\\') {
|
||||
hadSlash = true;
|
||||
continue;
|
||||
}
|
||||
buffer.append(ch);
|
||||
}
|
||||
if(hadSlash) {
|
||||
// then we're in the weird case of a \ at the end of the
|
||||
// string, let's output it anyway.
|
||||
buffer.append('\\');
|
||||
}
|
||||
return buffer.toString();
|
||||
return StringEscapeUtils.unescapeJava(str);
|
||||
}
|
||||
|
||||
// Padding
|
||||
|
|
|
@ -62,11 +62,11 @@ import junit.framework.TestSuite;
|
|||
import junit.textui.TestRunner;
|
||||
|
||||
/**
|
||||
* Unit tests {@link StringUtils}.
|
||||
* Unit tests for {@link StringEscapeUtils}.
|
||||
*
|
||||
* @author of original StringUtilsTest.testEscape = ?
|
||||
* @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
|
||||
* @version $Id: StringEscapeUtilsTest.java,v 1.2 2003/04/09 17:30:29 alex Exp $
|
||||
* @version $Id: StringEscapeUtilsTest.java,v 1.3 2003/04/09 18:45:29 alex Exp $
|
||||
*/
|
||||
public class StringEscapeUtilsTest extends TestCase {
|
||||
private final static String FOO = "foo";
|
||||
|
@ -96,13 +96,15 @@ public class StringEscapeUtilsTest extends TestCase {
|
|||
assertEscapeJava("\\\\\\b\\t\\r", "\\\b\t\r");
|
||||
assertEscapeJava("\\u1234", "\u1234");
|
||||
assertEscapeJava("\\u0234", "\u0234");
|
||||
assertEscapeJava("\\u00fd", "\u00fd");
|
||||
assertEscapeJava("\\u00EF", "\u00ef");
|
||||
assertEscapeJava("\\u0001", "\u0001");
|
||||
assertEscapeJava("Should use capitalized unicode hex", "\\uABCD", "\uabcd");
|
||||
|
||||
assertEscapeJava("He didn't say, \\\"stop!\\\"",
|
||||
"He didn't say, \"stop!\"");
|
||||
assertEscapeJava("non-breaking space", "This space is non-breaking:" + "\\u00a0",
|
||||
assertEscapeJava("non-breaking space", "This space is non-breaking:" + "\\u00A0",
|
||||
"This space is non-breaking:\u00a0");
|
||||
assertEscapeJava("\\uabcd\\u1234\\u012c",
|
||||
assertEscapeJava("\\uABCD\\u1234\\u012C",
|
||||
"\uABCD\u1234\u012C");
|
||||
}
|
||||
|
||||
|
@ -125,11 +127,26 @@ public class StringEscapeUtilsTest extends TestCase {
|
|||
assertUnescapeJava("test", "test");
|
||||
assertUnescapeJava("\ntest\b", "\\ntest\\b");
|
||||
assertUnescapeJava("\u123425foo\ntest\b", "\\u123425foo\\ntest\\b");
|
||||
//foo
|
||||
assertUnescapeJava("lowercase unicode", "\uABCDx", "\\uabcdx");
|
||||
assertUnescapeJava("uppercase unicode", "\uABCDx", "\\uABCDx");
|
||||
assertUnescapeJava("unicode as final character", "\uABCD", "\\uabcd");
|
||||
}
|
||||
|
||||
private void assertUnescapeJava(String unescaped, String original) throws IOException {
|
||||
assertEquals("unescape(String) failed",
|
||||
unescaped, StringUtils.unescape(original));
|
||||
assertUnescapeJava(null, unescaped, original);
|
||||
}
|
||||
|
||||
private void assertUnescapeJava(String message, String unescaped, String original) throws IOException {
|
||||
String expected = unescaped;
|
||||
String actual = StringEscapeUtils.unescapeJava(original);
|
||||
|
||||
assertEquals("unescape(String) failed" +
|
||||
(message == null ? "" : (": " + message)) +
|
||||
// we escape this so we can see it in the error message
|
||||
": expected '" + StringUtils.escape(expected) +
|
||||
"' actual '" + StringUtils.escape(actual) + "'",
|
||||
expected, actual);
|
||||
|
||||
StringPrintWriter writer = new StringPrintWriter();
|
||||
StringEscapeUtils.unescapeJava(writer, original);
|
||||
|
|
|
@ -69,7 +69,7 @@ import junit.textui.TestRunner;
|
|||
* @author <a href="mailto:fredrik@westermarck.com>Fredrik Westermarck</a>
|
||||
* @author Holger Krauth
|
||||
* @author <a href="hps@intermeta.de">Henning P. Schmiedehausen</a>
|
||||
* @version $Id: StringUtilsTest.java,v 1.17 2003/03/29 16:17:21 alex Exp $
|
||||
* @version $Id: StringUtilsTest.java,v 1.18 2003/04/09 18:45:29 alex Exp $
|
||||
*/
|
||||
public class StringUtilsTest extends TestCase {
|
||||
|
||||
|
@ -432,7 +432,7 @@ public class StringUtilsTest extends TestCase {
|
|||
assertEquals("escape(String) failed",
|
||||
"\\u0234", StringUtils.escape("\u0234") );
|
||||
assertEquals("escape(String) failed",
|
||||
"\\u00fd", StringUtils.escape("\u00fd") );
|
||||
"\\u00FD", StringUtils.escape("\u00fd") );
|
||||
assertEquals("unescape(String) failed",
|
||||
"", StringUtils.unescape("") );
|
||||
assertEquals("unescape(String) failed",
|
||||
|
@ -441,6 +441,8 @@ public class StringUtilsTest extends TestCase {
|
|||
"\ntest\b", StringUtils.unescape("\\ntest\\b") );
|
||||
assertEquals("unescape(String) failed",
|
||||
"\u123425foo\ntest\b", StringUtils.unescape("\\u123425foo\\ntest\\b") );
|
||||
assertEquals("unescape(String) failed with unicode as final char",
|
||||
"\u1234", StringUtils.unescape("\\u1234") );
|
||||
}
|
||||
|
||||
public void testGetLevenshteinDistance() {
|
||||
|
|
Loading…
Reference in New Issue