From c5e6378b846370eb5fd99e57de8a02f075f894bd Mon Sep 17 00:00:00 2001 From: Greg Wilkins Date: Mon, 12 Sep 2011 13:14:09 +1000 Subject: [PATCH] 357338 Improve UTF-8 validation --- .../eclipse/jetty/util/Utf8Appendable.java | 157 ++++++++++-------- .../eclipse/jetty/util/Utf8StringBuffer.java | 8 +- .../eclipse/jetty/util/Utf8StringBuilder.java | 8 +- .../jetty/util/Utf8StringBufferTest.java | 8 +- .../jetty/util/Utf8StringBuilderTest.java | 58 +++++-- .../websocket/WebSocketConnectionD13.java | 20 ++- 6 files changed, 158 insertions(+), 101 deletions(-) diff --git a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8Appendable.java b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8Appendable.java index 6e2d73f20dd..b869f431b91 100644 --- a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8Appendable.java +++ b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8Appendable.java @@ -5,9 +5,11 @@ import java.util.IllegalFormatCodePointException; public abstract class Utf8Appendable { + private final char REPLACEMENT = '\ufffd'; protected final Appendable _appendable; - protected int _more; - protected int _bits; + protected int _expectedContinuationBytes; + protected int _codePoint; + protected int _minCodePoint; public Utf8Appendable(Appendable appendable) { @@ -63,91 +65,112 @@ public abstract class Utf8Appendable protected void appendByte(byte b) throws IOException { + // Check for invalid bytes + if (b==(byte)0xc0 || b==(byte)0xc1 || (int)b>=0xf5) + { + _appendable.append(REPLACEMENT); + _expectedContinuationBytes=0; + _codePoint=0; + throw new NotUtf8Exception(); + } + + // Is it plain ASCII? if (b>=0) { - if (_more>0) + // Were we expecting a continuation byte? + if (_expectedContinuationBytes>0) { - _appendable.append('?'); - _more=0; - _bits=0; + _appendable.append(REPLACEMENT); + _expectedContinuationBytes=0; + _codePoint=0; throw new NotUtf8Exception(); } else _appendable.append((char)(0x7f&b)); } - else if (_more==0) + // Else is this a start byte + else if (_expectedContinuationBytes==0) { - if ((b&0xc0)!=0xc0) + if ((b & 0xe0) == 0xc0) { - // 10xxxxxx - _appendable.append('?'); - _more=0; - _bits=0; - throw new NotUtf8Exception(); + //110xxxxx + _expectedContinuationBytes=1; + _codePoint=b&0x1f; + _minCodePoint=0x80; + } + else if ((b & 0xf0) == 0xe0) + { + //1110xxxx + _expectedContinuationBytes=2; + _codePoint=b&0x0f; + _minCodePoint=0x800; + } + else if ((b & 0xf8) == 0xf0) + { + //11110xxx + _expectedContinuationBytes=3; + _codePoint=b&0x07; + _minCodePoint=0x10000; + } + else if ((b & 0xfc) == 0xf8) + { + //111110xx + _expectedContinuationBytes=4; + _codePoint=b&0x03; + _minCodePoint=0x200000; + } + else if ((b & 0xfe) == 0xfc) + { + //1111110x + _expectedContinuationBytes=5; + _codePoint=b&0x01; + _minCodePoint=0x400000; } else - { - if ((b & 0xe0) == 0xc0) - { - //110xxxxx - _more=1; - _bits=b&0x1f; - } - else if ((b & 0xf0) == 0xe0) - { - //1110xxxx - _more=2; - _bits=b&0x0f; - } - else if ((b & 0xf8) == 0xf0) - { - //11110xxx - _more=3; - _bits=b&0x07; - } - else if ((b & 0xfc) == 0xf8) - { - //111110xx - _more=4; - _bits=b&0x03; - } - else if ((b & 0xfe) == 0xfc) - { - //1111110x - _more=5; - _bits=b&0x01; - } - else - { - throw new NotUtf8Exception(); - } + { + _appendable.append(REPLACEMENT); + _expectedContinuationBytes=0; + _codePoint=0; + throw new NotUtf8Exception(); } } + // else is this a continuation character + else if ((b&0xc0)==0x80) + { + // 10xxxxxx + _codePoint=(_codePoint<<6)|(b&0x3f); + + // was that the last continuation? + if (--_expectedContinuationBytes==0) + { + // If this a valid unicode point? + if (_codePoint<_minCodePoint || (_codePoint>=0xD800 && _codePoint<=0xDFFF)) + { + _appendable.append(REPLACEMENT); + _expectedContinuationBytes=0; + _codePoint=0; + throw new NotUtf8Exception(); + } + + _minCodePoint=0; + char[] chars = Character.toChars(_codePoint); + for (char c : chars) + _appendable.append(c); + } + } + // Else this is not a continuation character else { - if ((b&0xc0)==0xc0) - { // 11?????? - _appendable.append('?'); - _more=0; - _bits=0; - throw new NotUtf8Exception(); - } - else - { - // 10xxxxxx - _bits=(_bits<<6)|(b&0x3f); - if (--_more==0) - { - if (_bits>=0xD800 && _bits<=0xDFFF) - throw new NotUtf8Exception(); - _appendable.append(new String(Character.toChars(_bits))); - } - } + // ! 10xxxxxx + _appendable.append(REPLACEMENT); + _expectedContinuationBytes=0; + _codePoint=0; + throw new NotUtf8Exception(); } } - public static class NotUtf8Exception extends IllegalStateException + public static class NotUtf8Exception extends IllegalArgumentException { public NotUtf8Exception() { diff --git a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuffer.java b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuffer.java index 40de5306dce..bd730deabf3 100644 --- a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuffer.java +++ b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuffer.java @@ -53,13 +53,13 @@ public class Utf8StringBuffer extends Utf8Appendable public void reset() { _buffer.setLength(0); - _more=0; - _bits=0; + _expectedContinuationBytes=0; + _codePoint=0; } public StringBuffer getStringBuffer() { - if (_more!=0) + if (_expectedContinuationBytes!=0) throw new NotUtf8Exception(); return _buffer; } @@ -67,7 +67,7 @@ public class Utf8StringBuffer extends Utf8Appendable @Override public String toString() { - if (_more!=0) + if (_expectedContinuationBytes!=0) throw new NotUtf8Exception(); return _buffer.toString(); } diff --git a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuilder.java b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuilder.java index 7e4477163c3..541590f6424 100644 --- a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuilder.java +++ b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuilder.java @@ -52,13 +52,13 @@ public class Utf8StringBuilder extends Utf8Appendable public void reset() { _buffer.setLength(0); - _more=0; - _bits=0; + _expectedContinuationBytes=0; + _codePoint=0; } public StringBuilder getStringBuilder() { - if (_more!=0) + if (_expectedContinuationBytes!=0) throw new NotUtf8Exception(); return _buffer; } @@ -66,7 +66,7 @@ public class Utf8StringBuilder extends Utf8Appendable @Override public String toString() { - if (_more!=0) + if (_expectedContinuationBytes!=0) throw new NotUtf8Exception(); return _buffer.toString(); } diff --git a/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBufferTest.java b/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBufferTest.java index 5b0c3afae54..9c44625e8fe 100644 --- a/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBufferTest.java +++ b/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBufferTest.java @@ -48,9 +48,9 @@ public class Utf8StringBufferTest buffer.toString(); assertTrue(false); } - catch(IllegalStateException e) + catch(Utf8Appendable.NotUtf8Exception e) { - assertTrue(e.toString().indexOf("!UTF-8")>=0); + assertTrue(true); } } @@ -70,11 +70,11 @@ public class Utf8StringBufferTest buffer.append(bytes[i]); assertTrue(false); } - catch(IllegalStateException e) + catch(Utf8Appendable.NotUtf8Exception e) { assertTrue(e.toString().indexOf("!UTF-8")>=0); } - assertEquals("abc?",buffer.toString()); + assertEquals("abc\ufffd",buffer.toString()); } @Test diff --git a/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBuilderTest.java b/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBuilderTest.java index 9cfca551290..bfa0cccd870 100644 --- a/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBuilderTest.java +++ b/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBuilderTest.java @@ -25,19 +25,44 @@ public class Utf8StringBuilderTest public void testInvalid() throws Exception { - Utf8StringBuilder buffer = new Utf8StringBuilder(); - buffer.append((byte)0xED); - buffer.append((byte)0xA0); - try - { - buffer.append((byte)0x80); - assertTrue(false); - } - catch(Utf8Appendable.NotUtf8Exception e) - { - assertTrue(true); - } + String[] invalids = { + "c0af", + "EDA080", + "f08080af", + "f8808080af", + "e080af", + "F4908080", + "fbbfbfbfbf" + }; + for (String i : invalids) + { + byte[] bytes = TypeUtil.fromHexString(i); + + /* Test what JVM does + try + { + String s = new String(bytes,0,bytes.length,"UTF-8"); + System.err.println(i+": "+s); + } + catch(Exception e) + { + System.err.println(i+": "+e); + } + */ + + try + { + Utf8StringBuilder buffer = new Utf8StringBuilder(); + buffer.append(bytes,0,bytes.length); + + assertEquals(i,"not expected",buffer.toString()); + } + catch(IllegalArgumentException e) + { + assertTrue(i,true); + } + } } @Test @@ -69,7 +94,7 @@ public class Utf8StringBuilderTest buffer.toString(); assertTrue(false); } - catch(IllegalStateException e) + catch(Utf8Appendable.NotUtf8Exception e) { assertTrue(e.toString().indexOf("!UTF-8")>=0); } @@ -91,11 +116,11 @@ public class Utf8StringBuilderTest buffer.append(bytes[i]); assertTrue(false); } - catch(IllegalStateException e) + catch(Utf8Appendable.NotUtf8Exception e) { - assertTrue(e.toString().indexOf("!UTF-8")>=0); + assertTrue(true); } - assertEquals("abc?", buffer.toString()); + assertEquals("abc\ufffd", buffer.toString()); } @@ -106,6 +131,7 @@ public class Utf8StringBuilderTest String source="\uD842\uDF9F"; byte[] bytes=source.getBytes("UTF-8"); + // System.err.println(TypeUtil.toHexString(bytes)); String jvmcheck = new String(bytes,0,bytes.length,"UTF-8"); assertEquals(source,jvmcheck); diff --git a/jetty-websocket/src/main/java/org/eclipse/jetty/websocket/WebSocketConnectionD13.java b/jetty-websocket/src/main/java/org/eclipse/jetty/websocket/WebSocketConnectionD13.java index bcda94485a8..9ac938c6e30 100644 --- a/jetty-websocket/src/main/java/org/eclipse/jetty/websocket/WebSocketConnectionD13.java +++ b/jetty-websocket/src/main/java/org/eclipse/jetty/websocket/WebSocketConnectionD13.java @@ -30,6 +30,7 @@ import org.eclipse.jetty.io.EndPoint; import org.eclipse.jetty.io.nio.SelectChannelEndPoint; import org.eclipse.jetty.util.B64Code; import org.eclipse.jetty.util.StringUtil; +import org.eclipse.jetty.util.TypeUtil; import org.eclipse.jetty.util.Utf8Appendable; import org.eclipse.jetty.util.Utf8StringBuilder; import org.eclipse.jetty.util.log.Log; @@ -628,6 +629,11 @@ public class WebSocketConnectionD13 extends AbstractConnection implements WebSoc { boolean lastFrame = isLastFrame(flags); + System.err.println("flags "+flags); + System.err.println("opcode "+opcode); + System.err.println("buffer "+TypeUtil.toHexString(buffer.asArray())); + + synchronized(WebSocketConnectionD13.this) { // Ignore incoming after a close @@ -827,19 +833,21 @@ public class WebSocketConnectionD13 extends AbstractConnection implements WebSoc return; } } + catch(ThreadDeath th) + { + throw th; + } catch(Utf8Appendable.NotUtf8Exception notUtf8) { LOG.warn("{} for {}",notUtf8,_endp); LOG.debug(notUtf8); errorClose(WebSocketConnectionD13.CLOSE_BAD_PAYLOAD,"Invalid UTF-8"); } - catch(ThreadDeath th) + catch(Throwable probablyNotUtf8) { - throw th; - } - catch(Throwable th) - { - LOG.warn(th); + LOG.warn("{} for {}",probablyNotUtf8,_endp); + LOG.debug(probablyNotUtf8); + errorClose(WebSocketConnectionD13.CLOSE_BAD_PAYLOAD,"Invalid Payload: "+probablyNotUtf8); } }