Escaping unicode uses capital letters e.g. \uABCD

Found and fixed bug when unicode character is at the end of a string to unescape. Added unit tests for above bug to both StringUtilsTest and StringEscapeUtilsTest. StringUtils.[un]escape now call StringEscapeUtils.[un]escapeJava. git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137291 13f79535-47bb-0310-9956-ffa450edef68
2003-04-09 18:45:29 +00:00 · 2003-04-09 18:45:29 +00:00 · 6af3b80369
parent 2e862e0c47
commit 6af3b80369
4 changed files with 52 additions and 148 deletions
--- a/src/java/org/apache/commons/lang/StringEscapeUtils.java
+++ b/src/java/org/apache/commons/lang/StringEscapeUtils.java
@ -75,7 +75,7 @@ import org.apache.commons.lang.exception.NestableRuntimeException;
 * @author Helge Tesgaard
 * @author <a href="sean@boohai.com">Sean Brown</a>
 * @since 2.0
- * @version $Id: StringEscapeUtils.java,v 1.4 2003/04/09 00:07:49 ggregory Exp $
+ * @version $Id: StringEscapeUtils.java,v 1.5 2003/04/09 18:45:28 alex Exp $
 */
 public class StringEscapeUtils {

@ -184,11 +184,11 @@ public class StringEscapeUtils {

            // handle unicode
            if (ch > 0xfff) {
-                out.write("\\u" + Integer.toHexString(ch));
+                out.write("\\u" + hex(ch));
            } else if (ch > 0xff) {
-                out.write("\\u0" + Integer.toHexString(ch));
+                out.write("\\u0" + hex(ch));
            } else if (ch > 0x7f) {
-                out.write("\\u00" + Integer.toHexString(ch));
+                out.write("\\u00" + hex(ch));
            } else if (ch < 32) {
                switch (ch) {
                    case '\b':
@ -213,9 +213,9 @@ public class StringEscapeUtils {
                        break;
                    default :
                        if (ch > 0xf) {
-                            out.write("\\u00" + Integer.toHexString(ch));
+                            out.write("\\u00" + hex(ch));
                        } else {
-                            out.write("\\u000" + Integer.toHexString(ch));
+                            out.write("\\u000" + hex(ch));
                        }
                        break;
                }
@ -241,6 +241,10 @@ public class StringEscapeUtils {
        }
    }

+    private static String hex(char ch) {
+        return Integer.toHexString(ch).toUpperCase();
+    }
+
    /**
     * Unescapes any Java literals found in the String. For example,
     * it will turn a sequence of '\' and 'n' into a newline character,
@ -268,6 +272,7 @@ public class StringEscapeUtils {
            if (inUnicode) {
                // if in unicode, then we're reading unicode
                // values in somehow
+                unicode.append(ch);
                if (unicode.length() == 4) {
                    // unicode now contains the four hex digits
                    // which represents our unicode chacater
@ -275,16 +280,13 @@ public class StringEscapeUtils {
                        int value = Integer.parseInt(unicode.toString(), 16);
                        out.write((char) value);
                        unicode.setLength(0);
-                        unicode.setLength(4);
                        inUnicode = false;
                        hadSlash = false;
                    } catch (NumberFormatException nfe) {
                        throw new NestableRuntimeException("Unable to parse unicode value: " + unicode, nfe);
                    }
-                } else {
-                    unicode.append(ch);
-                    continue;
                }
+                continue;
            }
            if (hadSlash) {
                // handle an escaped value
--- a/src/java/org/apache/commons/lang/StringUtils.java
+++ b/src/java/org/apache/commons/lang/StringUtils.java
@ -78,7 +78,7 @@ import org.apache.commons.lang.exception.NestableRuntimeException;
 * @author <a href="mailto:hps@intermeta.de">Henning P. Schmiedehausen</a>
 * @author Arun Mammen Thomas
 * @since 1.0
- * @version $Id: StringUtils.java,v 1.41 2003/04/09 00:07:50 ggregory Exp $
+ * @version $Id: StringUtils.java,v 1.42 2003/04/09 18:45:29 alex Exp $
 */
 public class StringUtils {

@ -1140,147 +1140,30 @@ public class StringUtils {
     * <p>So a tab becomes the characters <code>'\\'</code> and
     * <code>'t'</code>.</p>
     *
+     * <p>As of Lang 2.0, this calls {@link StringEscapeUtils#escapeJava(java.lang.String)}
+     * behind the scenes.  For convenience, this method is not deprecated.
+     * </p>
+     * @see StringEscapeUtils#escapeJava(java.lang.String)
     * @param str String to escape values in
     * @return String with escaped values
     * @throws NullPointerException if str is <code>null</code>
     */
    public static String escape(String str) {
-        // improved with code from  cybertiger@cyberiantiger.org
-        // unicode from him, and defaul for < 32's.
-        int sz = str.length();
-        StringBuffer buffer = new StringBuffer(2 * sz);
-        for (int i = 0; i < sz; i++) {
-            char ch = str.charAt(i);
-
-            // handle unicode
-            if (ch > 0xfff) {
-                buffer.append("\\u" + Integer.toHexString(ch));
-            } else if (ch > 0xff) {
-                buffer.append("\\u0" + Integer.toHexString(ch));
-            } else if (ch > 0x7f) {
-                buffer.append("\\u00" + Integer.toHexString(ch));
-            } else if (ch < 32) {
-                switch (ch) {
-                    case '\b' :
-                        buffer.append('\\');
-                        buffer.append('b');
-                        break;
-                    case '\n' :
-                        buffer.append('\\');
-                        buffer.append('n');
-                        break;
-                    case '\t' :
-                        buffer.append('\\');
-                        buffer.append('t');
-                        break;
-                    case '\f' :
-                        buffer.append('\\');
-                        buffer.append('f');
-                        break;
-                    case '\r' :
-                        buffer.append('\\');
-                        buffer.append('r');
-                        break;
-                    default :
-                        if (ch > 0xf) {
-                            buffer.append("\\u00" + Integer.toHexString(ch));
-                        } else {
-                            buffer.append("\\u000" + Integer.toHexString(ch));
-                        }
-                        break;
-                }
-            } else {
-                switch (ch) {
-                    case '\'' :
-                        buffer.append('\\');
-                        buffer.append('\'');
-                        break;
-                    case '"' :
-                        buffer.append('\\');
-                        buffer.append('"');
-                        break;
-                    case '\\' :
-                        buffer.append('\\');
-                        buffer.append('\\');
-                        break;
-                    default :
-                        buffer.append(ch);
-                        break;
-                }
-            }
-        }
-        return buffer.toString();
+        return StringEscapeUtils.escapeJava(str);
    }

    /**
     * Unescapes any Java literals found in the String. For example, 
     * it will turn a sequence of '\' and 'n' into a newline character, 
     * unless the '\' is preceded by another '\'.
+     * <p>
+     * As of Lang 2.0, this calls {@link StringEscapeUtils#unescapeJava(java.lang.String)}
+     * behind the scenes.  For convenience, this method is not deprecated.
+     * <p>
+     * @see StringEscapeUtils#unescapeJava(java.lang.String)
     */
    public static String unescape(String str) {
-        int sz = str.length();
-        StringBuffer buffer = new StringBuffer(sz);
-        StringBuffer unicode = new StringBuffer(4);
-        boolean hadSlash = false;
-        boolean inUnicode = false;
-        for (int i = 0; i < sz; i++) {
-            char ch = str.charAt(i);
-            if(inUnicode) {
-                // if in unicode, then we're reading unicode 
-                // values in somehow
-                if(unicode.length() == 4) {
-                    // unicode now contains the four hex digits 
-                    // which represents our unicode chacater
-                    try {
-                        int value = Integer.parseInt(unicode.toString(), 16);
-                        buffer.append( (char)value );
-                        unicode.setLength(0);
-                        unicode.setLength(4);
-                        inUnicode = false;
-                        hadSlash = false;
-                    } catch(NumberFormatException nfe) {
-                        throw new NestableRuntimeException("Unable to parse unicode value: "+unicode, nfe);
-                    }
-                } else {
-                    unicode.append(ch);
-                    continue;
-                }
-            }
-            if(hadSlash) {
-                // handle an escaped value
-                hadSlash = false;
-                switch(ch) {
-                    case '\\': buffer.append('\\'); break;
-                    case '\'': buffer.append('\''); break;
-                    case '\"': buffer.append('"'); break;
-                    case 'r':  buffer.append('\r'); break;
-                    case 'f':  buffer.append('\f'); break;
-                    case 't':  buffer.append('\t'); break;
-                    case 'n':  buffer.append('\n'); break;
-                    case 'b':  buffer.append('\b'); break;
-                    case 'u':  {
-                        // uh-oh, we're in unicode country....
-                        inUnicode=true;
-                        break;
-                    }
-                    default :
-                        buffer.append(ch);
-                        break;
-                }
-                continue;
-            } else
-            if(ch == '\\') {
-                hadSlash = true;
-                continue;
-            } 
-            buffer.append(ch);
-        }
-        if(hadSlash) {
-            // then we're in the weird case of a \ at the end of the 
-            // string, let's output it anyway.
-            buffer.append('\\');
-        }
-        return buffer.toString();
+        return StringEscapeUtils.unescapeJava(str);
    }

    // Padding
--- a/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java
+++ b/src/test/org/apache/commons/lang/StringEscapeUtilsTest.java
@ -62,11 +62,11 @@ import junit.framework.TestSuite;
 import junit.textui.TestRunner;

 /**
- * Unit tests {@link StringUtils}.
+ * Unit tests for {@link StringEscapeUtils}.
 *
 * @author of original StringUtilsTest.testEscape = ?
 * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
- * @version $Id: StringEscapeUtilsTest.java,v 1.2 2003/04/09 17:30:29 alex Exp $
+ * @version $Id: StringEscapeUtilsTest.java,v 1.3 2003/04/09 18:45:29 alex Exp $
 */
 public class StringEscapeUtilsTest extends TestCase {
    private final static String FOO = "foo";
@ -96,13 +96,15 @@ public class StringEscapeUtilsTest extends TestCase {
        assertEscapeJava("\\\\\\b\\t\\r", "\\\b\t\r");
        assertEscapeJava("\\u1234", "\u1234");
        assertEscapeJava("\\u0234", "\u0234");
-        assertEscapeJava("\\u00fd", "\u00fd");
+        assertEscapeJava("\\u00EF", "\u00ef");
+        assertEscapeJava("\\u0001", "\u0001");
+        assertEscapeJava("Should use capitalized unicode hex", "\\uABCD", "\uabcd");

        assertEscapeJava("He didn't say, \\\"stop!\\\"",
                "He didn't say, \"stop!\"");
-        assertEscapeJava("non-breaking space", "This space is non-breaking:" + "\\u00a0",
+        assertEscapeJava("non-breaking space", "This space is non-breaking:" + "\\u00A0",
                "This space is non-breaking:\u00a0");
-        assertEscapeJava("\\uabcd\\u1234\\u012c",
+        assertEscapeJava("\\uABCD\\u1234\\u012C",
                "\uABCD\u1234\u012C");
    }

@ -125,11 +127,26 @@ public class StringEscapeUtilsTest extends TestCase {
        assertUnescapeJava("test", "test");
        assertUnescapeJava("\ntest\b", "\\ntest\\b");
        assertUnescapeJava("\u123425foo\ntest\b", "\\u123425foo\\ntest\\b");
+        //foo
+        assertUnescapeJava("lowercase unicode", "\uABCDx", "\\uabcdx");
+        assertUnescapeJava("uppercase unicode", "\uABCDx", "\\uABCDx");
+        assertUnescapeJava("unicode as final character", "\uABCD", "\\uabcd");
    }

    private void assertUnescapeJava(String unescaped, String original) throws IOException {
-        assertEquals("unescape(String) failed",
-                unescaped, StringUtils.unescape(original));
+        assertUnescapeJava(null, unescaped, original);
+    }
+
+    private void assertUnescapeJava(String message, String unescaped, String original) throws IOException {
+        String expected = unescaped;
+        String actual = StringEscapeUtils.unescapeJava(original);
+
+        assertEquals("unescape(String) failed" +
+                (message == null ? "" : (": " + message)) +
+                // we escape this so we can see it in the error message
+                ": expected '" + StringUtils.escape(expected) +
+                "' actual '" + StringUtils.escape(actual) + "'",
+                expected, actual);

        StringPrintWriter writer = new StringPrintWriter();
        StringEscapeUtils.unescapeJava(writer, original);
--- a/src/test/org/apache/commons/lang/StringUtilsTest.java
+++ b/src/test/org/apache/commons/lang/StringUtilsTest.java
@ -69,7 +69,7 @@ import junit.textui.TestRunner;
 * @author <a href="mailto:fredrik@westermarck.com>Fredrik Westermarck</a>
 * @author Holger Krauth
 * @author <a href="hps@intermeta.de">Henning P. Schmiedehausen</a>
- * @version $Id: StringUtilsTest.java,v 1.17 2003/03/29 16:17:21 alex Exp $
+ * @version $Id: StringUtilsTest.java,v 1.18 2003/04/09 18:45:29 alex Exp $
 */
 public class StringUtilsTest extends TestCase {

@ -432,7 +432,7 @@ public class StringUtilsTest extends TestCase {
        assertEquals("escape(String) failed",
                     "\\u0234", StringUtils.escape("\u0234") );
        assertEquals("escape(String) failed",
-                     "\\u00fd", StringUtils.escape("\u00fd") );
+                     "\\u00FD", StringUtils.escape("\u00fd") );
        assertEquals("unescape(String) failed",
                     "", StringUtils.unescape("") );
        assertEquals("unescape(String) failed",
@ -441,6 +441,8 @@ public class StringUtilsTest extends TestCase {
                     "\ntest\b", StringUtils.unescape("\\ntest\\b") );
        assertEquals("unescape(String) failed",
                     "\u123425foo\ntest\b", StringUtils.unescape("\\u123425foo\\ntest\\b") );
+        assertEquals("unescape(String) failed with unicode as final char",
+                     "\u1234", StringUtils.unescape("\\u1234") );
    }

    public void testGetLevenshteinDistance() {