Issue #4033 - lenient percent decode in URIUtil

+ Allows for preserving decoded Strings like "X%YZ"

Signed-off-by: Joakim Erdfelt <joakim.erdfelt@gmail.com>
This commit is contained in:
Joakim Erdfelt 2019-08-28 12:08:18 -05:00
parent b2ea6a0861
commit 7da57151ed
3 changed files with 138 additions and 15 deletions

View File

@ -644,6 +644,21 @@ public class StringUtil
return __UTF8.equalsIgnoreCase(charset) || __UTF8.equalsIgnoreCase(normalizeCharset(charset));
}
public static boolean isHex(String str, int offset, int length)
{
for (int i = offset; i < (offset + length); i++)
{
char c = str.charAt(i);
if (!(((c >= 'a') && (c <= 'f')) ||
((c >= 'A') && (c <= 'F')) ||
((c >= '0') && (c <= '9'))))
{
return false;
}
}
return true;
}
public static String printable(String name)
{
if (name == null)

View File

@ -470,24 +470,72 @@ public class URIUtil
builder = new Utf8StringBuilder(path.length());
builder.append(path, offset, i - offset);
}
if ((i + 2) < end)
// lenient percent decoding
if (i >= end)
{
char u = path.charAt(i + 1);
if (u == 'u')
// [LENIENT] a percent sign at end of string.
builder.append('%');
i = end;
}
else if (end > (i + 1))
{
char type = path.charAt(i + 1);
if (type == 'u')
{
// TODO this is wrong. This is a codepoint not a char
builder.append((char)(0xffff & TypeUtil.parseInt(path, i + 2, 4, 16)));
i += 5;
// We have a possible (deprecated) microsoft unicode code point "%u####"
// - not recommended to use as it's limited to 2 bytes.
if ((i + 5) >= end)
{
// [LENIENT] we have a partial "%u####" at the end of a string.
builder.append(path, i, (end - i));
i = end;
}
else
{
// this seems wrong, as we are casting to a char, but that's the known
// limitation of this deprecated encoding (only 2 bytes allowed)
if (StringUtil.isHex(path, i + 2, 4))
{
builder.append((char)(0xffff & TypeUtil.parseInt(path, i + 2, 4, 16)));
i += 5;
}
else
{
// [LENIENT] copy the "%u" as-is.
builder.append(path, i, 2);
i += 1;
}
}
}
else if (end > (i + 2))
{
// we have a possible "%##" encoding
if (StringUtil.isHex(path, i + 1, 2))
{
char c1 = path.charAt(i + 1);
char c2 = path.charAt(i + 2);
builder.append((byte)(0xff & (TypeUtil.convertHexDigit(c1) * 16 + TypeUtil.convertHexDigit(c2))));
i += 2;
}
else
{
builder.append(path, i, 3);
i += 2;
}
}
else
{
builder.append((byte)(0xff & (TypeUtil.convertHexDigit(u) * 16 + TypeUtil.convertHexDigit(path.charAt(i + 2)))));
i += 2;
// [LENIENT] incomplete "%##" sequence at end of string
builder.append(path, i, (end - i));
i = end;
}
}
else
{
throw new IllegalArgumentException("Bad URI % encoding");
// [LENIENT] the "%" at the end of the string
builder.append(path, i, (end - i));
i = end;
}
break;
@ -1156,12 +1204,32 @@ public class URIUtil
int oa = uriA.charAt(a++);
int ca = oa;
if (ca == '%')
ca = TypeUtil.convertHexDigit(uriA.charAt(a++)) * 16 + TypeUtil.convertHexDigit(uriA.charAt(a++));
{
ca = lenientPercentDecode(uriA, a);
if (ca == (-1))
{
ca = '%';
}
else
{
a += 2;
}
}
int ob = uriB.charAt(b++);
int cb = ob;
if (cb == '%')
cb = TypeUtil.convertHexDigit(uriB.charAt(b++)) * 16 + TypeUtil.convertHexDigit(uriB.charAt(b++));
{
cb = lenientPercentDecode(uriB, b);
if (cb == (-1))
{
cb = '%';
}
else
{
b += 2;
}
}
if (ca == '/' && oa != ob)
return false;
@ -1172,6 +1240,23 @@ public class URIUtil
return a == lenA && b == lenB;
}
private static int lenientPercentDecode(String str, int offset)
{
if (offset >= str.length())
return -1;
char a1 = str.charAt(offset);
char a2 = str.charAt(offset + 1);
try
{
return TypeUtil.convertHexDigit(a1) * 16 + TypeUtil.convertHexDigit(a2);
}
catch (NumberFormatException e)
{
return -1;
}
}
public static boolean equalsIgnoreEncodings(URI uriA, URI uriB)
{
if (uriA.equals(uriB))

View File

@ -90,9 +90,22 @@ public class URIUtilTest
// Test for null character (real world ugly test case)
byte[] oddBytes = {'/', 0x00, '/'};
String odd = new String(oddBytes, StandardCharsets.ISO_8859_1);
assertEquals(odd, URIUtil.decodePath("/%00/"));
arguments.add(Arguments.of("/%00/", odd));
// Deprecated Microsoft Percent-U encoding
arguments.add(Arguments.of("abc%u3040", "abc\u3040"));
// Lenient decode
arguments.add(Arguments.of("abc%xyz", "abc%xyz")); // not a "%##"
arguments.add(Arguments.of("abc%", "abc%")); // percent at end of string
arguments.add(Arguments.of("abc%A", "abc%A")); // incomplete "%##" at end of string
arguments.add(Arguments.of("abc%uvwxyz", "abc%uvwxyz")); // not a valid "%u####"
arguments.add(Arguments.of("abc%uEFGHIJ", "abc%uEFGHIJ")); // not a valid "%u####"
arguments.add(Arguments.of("abc%uABC", "abc%uABC")); // incomplete "%u####"
arguments.add(Arguments.of("abc%uAB", "abc%uAB")); // incomplete "%u####"
arguments.add(Arguments.of("abc%uA", "abc%uA")); // incomplete "%u####"
arguments.add(Arguments.of("abc%u", "abc%u")); // incomplete "%u####"
return arguments.stream();
}
@ -344,7 +357,11 @@ public class URIUtilTest
Arguments.of("/b rry%27s", "/b%20rry%27s"),
Arguments.of("/foo%2fbar", "/foo%2fbar"),
Arguments.of("/foo%2fbar", "/foo%2Fbar")
Arguments.of("/foo%2fbar", "/foo%2Fbar"),
// encoded vs not-encode ("%" symbol is encoded as "%25")
Arguments.of("/abc%25xyz", "/abc%xyz"),
Arguments.of("/zzz%25", "/zzz%")
);
}
@ -358,11 +375,17 @@ public class URIUtilTest
public static Stream<Arguments> equalsIgnoreEncodingStringFalseSource()
{
return Stream.of(
// case difference
Arguments.of("ABC", "abc"),
// Encoding difference ("'" is "%27")
Arguments.of("/barry's", "/barry%26s"),
// Never match on "%2f" differences
Arguments.of("/foo/bar", "/foo%2fbar"),
Arguments.of("/foo2fbar", "/foo/bar")
// not actually encoded
Arguments.of("/foo2fbar", "/foo/bar"),
// encoded vs not-encode ("%" symbol is encoded as "%25")
Arguments.of("/yyy%25zzz", "/aaa%xxx"),
Arguments.of("/zzz%25", "/aaa%")
);
}