357338 Improve UTF-8 validation

This commit is contained in:
Greg Wilkins 2011-09-12 13:14:09 +10:00
parent eab970e5c0
commit c5e6378b84
6 changed files with 158 additions and 101 deletions

View File

@ -5,9 +5,11 @@ import java.util.IllegalFormatCodePointException;
public abstract class Utf8Appendable public abstract class Utf8Appendable
{ {
private final char REPLACEMENT = '\ufffd';
protected final Appendable _appendable; protected final Appendable _appendable;
protected int _more; protected int _expectedContinuationBytes;
protected int _bits; protected int _codePoint;
protected int _minCodePoint;
public Utf8Appendable(Appendable appendable) public Utf8Appendable(Appendable appendable)
{ {
@ -63,91 +65,112 @@ public abstract class Utf8Appendable
protected void appendByte(byte b) throws IOException protected void appendByte(byte b) throws IOException
{ {
// Check for invalid bytes
if (b==(byte)0xc0 || b==(byte)0xc1 || (int)b>=0xf5)
{
_appendable.append(REPLACEMENT);
_expectedContinuationBytes=0;
_codePoint=0;
throw new NotUtf8Exception();
}
// Is it plain ASCII?
if (b>=0) if (b>=0)
{ {
if (_more>0) // Were we expecting a continuation byte?
if (_expectedContinuationBytes>0)
{ {
_appendable.append('?'); _appendable.append(REPLACEMENT);
_more=0; _expectedContinuationBytes=0;
_bits=0; _codePoint=0;
throw new NotUtf8Exception(); throw new NotUtf8Exception();
} }
else else
_appendable.append((char)(0x7f&b)); _appendable.append((char)(0x7f&b));
} }
else if (_more==0) // Else is this a start byte
else if (_expectedContinuationBytes==0)
{ {
if ((b&0xc0)!=0xc0) if ((b & 0xe0) == 0xc0)
{ {
// 10xxxxxx //110xxxxx
_appendable.append('?'); _expectedContinuationBytes=1;
_more=0; _codePoint=b&0x1f;
_bits=0; _minCodePoint=0x80;
throw new NotUtf8Exception(); }
else if ((b & 0xf0) == 0xe0)
{
//1110xxxx
_expectedContinuationBytes=2;
_codePoint=b&0x0f;
_minCodePoint=0x800;
}
else if ((b & 0xf8) == 0xf0)
{
//11110xxx
_expectedContinuationBytes=3;
_codePoint=b&0x07;
_minCodePoint=0x10000;
}
else if ((b & 0xfc) == 0xf8)
{
//111110xx
_expectedContinuationBytes=4;
_codePoint=b&0x03;
_minCodePoint=0x200000;
}
else if ((b & 0xfe) == 0xfc)
{
//1111110x
_expectedContinuationBytes=5;
_codePoint=b&0x01;
_minCodePoint=0x400000;
} }
else else
{ {
if ((b & 0xe0) == 0xc0) _appendable.append(REPLACEMENT);
{ _expectedContinuationBytes=0;
//110xxxxx _codePoint=0;
_more=1; throw new NotUtf8Exception();
_bits=b&0x1f;
}
else if ((b & 0xf0) == 0xe0)
{
//1110xxxx
_more=2;
_bits=b&0x0f;
}
else if ((b & 0xf8) == 0xf0)
{
//11110xxx
_more=3;
_bits=b&0x07;
}
else if ((b & 0xfc) == 0xf8)
{
//111110xx
_more=4;
_bits=b&0x03;
}
else if ((b & 0xfe) == 0xfc)
{
//1111110x
_more=5;
_bits=b&0x01;
}
else
{
throw new NotUtf8Exception();
}
} }
} }
// else is this a continuation character
else if ((b&0xc0)==0x80)
{
// 10xxxxxx
_codePoint=(_codePoint<<6)|(b&0x3f);
// was that the last continuation?
if (--_expectedContinuationBytes==0)
{
// Is this a valid unicode code point?
if (_codePoint<_minCodePoint || (_codePoint>=0xD800 && _codePoint<=0xDFFF))
{
_appendable.append(REPLACEMENT);
_expectedContinuationBytes=0;
_codePoint=0;
throw new NotUtf8Exception();
}
_minCodePoint=0;
char[] chars = Character.toChars(_codePoint);
for (char c : chars)
_appendable.append(c);
}
}
// Else this is not a continuation character
else else
{ {
if ((b&0xc0)==0xc0) // ! 10xxxxxx
{ // 11?????? _appendable.append(REPLACEMENT);
_appendable.append('?'); _expectedContinuationBytes=0;
_more=0; _codePoint=0;
_bits=0; throw new NotUtf8Exception();
throw new NotUtf8Exception();
}
else
{
// 10xxxxxx
_bits=(_bits<<6)|(b&0x3f);
if (--_more==0)
{
if (_bits>=0xD800 && _bits<=0xDFFF)
throw new NotUtf8Exception();
_appendable.append(new String(Character.toChars(_bits)));
}
}
} }
} }
public static class NotUtf8Exception extends IllegalStateException public static class NotUtf8Exception extends IllegalArgumentException
{ {
public NotUtf8Exception() public NotUtf8Exception()
{ {

View File

@ -53,13 +53,13 @@ public class Utf8StringBuffer extends Utf8Appendable
public void reset() public void reset()
{ {
_buffer.setLength(0); _buffer.setLength(0);
_more=0; _expectedContinuationBytes=0;
_bits=0; _codePoint=0;
} }
public StringBuffer getStringBuffer() public StringBuffer getStringBuffer()
{ {
if (_more!=0) if (_expectedContinuationBytes!=0)
throw new NotUtf8Exception(); throw new NotUtf8Exception();
return _buffer; return _buffer;
} }
@ -67,7 +67,7 @@ public class Utf8StringBuffer extends Utf8Appendable
@Override @Override
public String toString() public String toString()
{ {
if (_more!=0) if (_expectedContinuationBytes!=0)
throw new NotUtf8Exception(); throw new NotUtf8Exception();
return _buffer.toString(); return _buffer.toString();
} }

View File

@ -52,13 +52,13 @@ public class Utf8StringBuilder extends Utf8Appendable
public void reset() public void reset()
{ {
_buffer.setLength(0); _buffer.setLength(0);
_more=0; _expectedContinuationBytes=0;
_bits=0; _codePoint=0;
} }
public StringBuilder getStringBuilder() public StringBuilder getStringBuilder()
{ {
if (_more!=0) if (_expectedContinuationBytes!=0)
throw new NotUtf8Exception(); throw new NotUtf8Exception();
return _buffer; return _buffer;
} }
@ -66,7 +66,7 @@ public class Utf8StringBuilder extends Utf8Appendable
@Override @Override
public String toString() public String toString()
{ {
if (_more!=0) if (_expectedContinuationBytes!=0)
throw new NotUtf8Exception(); throw new NotUtf8Exception();
return _buffer.toString(); return _buffer.toString();
} }

View File

@ -48,9 +48,9 @@ public class Utf8StringBufferTest
buffer.toString(); buffer.toString();
assertTrue(false); assertTrue(false);
} }
catch(IllegalStateException e) catch(Utf8Appendable.NotUtf8Exception e)
{ {
assertTrue(e.toString().indexOf("!UTF-8")>=0); assertTrue(true);
} }
} }
@ -70,11 +70,11 @@ public class Utf8StringBufferTest
buffer.append(bytes[i]); buffer.append(bytes[i]);
assertTrue(false); assertTrue(false);
} }
catch(IllegalStateException e) catch(Utf8Appendable.NotUtf8Exception e)
{ {
assertTrue(e.toString().indexOf("!UTF-8")>=0); assertTrue(e.toString().indexOf("!UTF-8")>=0);
} }
assertEquals("abc?",buffer.toString()); assertEquals("abc\ufffd",buffer.toString());
} }
@Test @Test

View File

@ -25,19 +25,44 @@ public class Utf8StringBuilderTest
public void testInvalid() public void testInvalid()
throws Exception throws Exception
{ {
Utf8StringBuilder buffer = new Utf8StringBuilder(); String[] invalids = {
buffer.append((byte)0xED); "c0af",
buffer.append((byte)0xA0); "EDA080",
try "f08080af",
{ "f8808080af",
buffer.append((byte)0x80); "e080af",
assertTrue(false); "F4908080",
} "fbbfbfbfbf"
catch(Utf8Appendable.NotUtf8Exception e) };
{
assertTrue(true);
}
for (String i : invalids)
{
byte[] bytes = TypeUtil.fromHexString(i);
/* Test what JVM does
try
{
String s = new String(bytes,0,bytes.length,"UTF-8");
System.err.println(i+": "+s);
}
catch(Exception e)
{
System.err.println(i+": "+e);
}
*/
try
{
Utf8StringBuilder buffer = new Utf8StringBuilder();
buffer.append(bytes,0,bytes.length);
assertEquals(i,"not expected",buffer.toString());
}
catch(IllegalArgumentException e)
{
assertTrue(i,true);
}
}
} }
@Test @Test
@ -69,7 +94,7 @@ public class Utf8StringBuilderTest
buffer.toString(); buffer.toString();
assertTrue(false); assertTrue(false);
} }
catch(IllegalStateException e) catch(Utf8Appendable.NotUtf8Exception e)
{ {
assertTrue(e.toString().indexOf("!UTF-8")>=0); assertTrue(e.toString().indexOf("!UTF-8")>=0);
} }
@ -91,11 +116,11 @@ public class Utf8StringBuilderTest
buffer.append(bytes[i]); buffer.append(bytes[i]);
assertTrue(false); assertTrue(false);
} }
catch(IllegalStateException e) catch(Utf8Appendable.NotUtf8Exception e)
{ {
assertTrue(e.toString().indexOf("!UTF-8")>=0); assertTrue(true);
} }
assertEquals("abc?", buffer.toString()); assertEquals("abc\ufffd", buffer.toString());
} }
@ -106,6 +131,7 @@ public class Utf8StringBuilderTest
String source="\uD842\uDF9F"; String source="\uD842\uDF9F";
byte[] bytes=source.getBytes("UTF-8"); byte[] bytes=source.getBytes("UTF-8");
// System.err.println(TypeUtil.toHexString(bytes));
String jvmcheck = new String(bytes,0,bytes.length,"UTF-8"); String jvmcheck = new String(bytes,0,bytes.length,"UTF-8");
assertEquals(source,jvmcheck); assertEquals(source,jvmcheck);

View File

@ -30,6 +30,7 @@ import org.eclipse.jetty.io.EndPoint;
import org.eclipse.jetty.io.nio.SelectChannelEndPoint; import org.eclipse.jetty.io.nio.SelectChannelEndPoint;
import org.eclipse.jetty.util.B64Code; import org.eclipse.jetty.util.B64Code;
import org.eclipse.jetty.util.StringUtil; import org.eclipse.jetty.util.StringUtil;
import org.eclipse.jetty.util.TypeUtil;
import org.eclipse.jetty.util.Utf8Appendable; import org.eclipse.jetty.util.Utf8Appendable;
import org.eclipse.jetty.util.Utf8StringBuilder; import org.eclipse.jetty.util.Utf8StringBuilder;
import org.eclipse.jetty.util.log.Log; import org.eclipse.jetty.util.log.Log;
@ -628,6 +629,11 @@ public class WebSocketConnectionD13 extends AbstractConnection implements WebSoc
{ {
boolean lastFrame = isLastFrame(flags); boolean lastFrame = isLastFrame(flags);
System.err.println("flags "+flags);
System.err.println("opcode "+opcode);
System.err.println("buffer "+TypeUtil.toHexString(buffer.asArray()));
synchronized(WebSocketConnectionD13.this) synchronized(WebSocketConnectionD13.this)
{ {
// Ignore incoming after a close // Ignore incoming after a close
@ -827,19 +833,21 @@ public class WebSocketConnectionD13 extends AbstractConnection implements WebSoc
return; return;
} }
} }
catch(ThreadDeath th)
{
throw th;
}
catch(Utf8Appendable.NotUtf8Exception notUtf8) catch(Utf8Appendable.NotUtf8Exception notUtf8)
{ {
LOG.warn("{} for {}",notUtf8,_endp); LOG.warn("{} for {}",notUtf8,_endp);
LOG.debug(notUtf8); LOG.debug(notUtf8);
errorClose(WebSocketConnectionD13.CLOSE_BAD_PAYLOAD,"Invalid UTF-8"); errorClose(WebSocketConnectionD13.CLOSE_BAD_PAYLOAD,"Invalid UTF-8");
} }
catch(ThreadDeath th) catch(Throwable probablyNotUtf8)
{ {
throw th; LOG.warn("{} for {}",probablyNotUtf8,_endp);
} LOG.debug(probablyNotUtf8);
catch(Throwable th) errorClose(WebSocketConnectionD13.CLOSE_BAD_PAYLOAD,"Invalid Payload: "+probablyNotUtf8);
{
LOG.warn(th);
} }
} }