357338 Improve UTF-8 validation

This commit is contained in:
Greg Wilkins 2011-09-12 13:14:09 +10:00
parent eab970e5c0
commit c5e6378b84
6 changed files with 158 additions and 101 deletions

View File

@ -5,9 +5,11 @@ import java.util.IllegalFormatCodePointException;
public abstract class Utf8Appendable
{
private final char REPLACEMENT = '\ufffd';
protected final Appendable _appendable;
protected int _more;
protected int _bits;
protected int _expectedContinuationBytes;
protected int _codePoint;
protected int _minCodePoint;
public Utf8Appendable(Appendable appendable)
{
@ -63,91 +65,112 @@ public abstract class Utf8Appendable
protected void appendByte(byte b) throws IOException
{
// Check for invalid bytes
if (b==(byte)0xc0 || b==(byte)0xc1 || (int)b>=0xf5)
{
_appendable.append(REPLACEMENT);
_expectedContinuationBytes=0;
_codePoint=0;
throw new NotUtf8Exception();
}
// Is it plain ASCII?
if (b>=0)
{
if (_more>0)
// Were we expecting a continuation byte?
if (_expectedContinuationBytes>0)
{
_appendable.append('?');
_more=0;
_bits=0;
_appendable.append(REPLACEMENT);
_expectedContinuationBytes=0;
_codePoint=0;
throw new NotUtf8Exception();
}
else
_appendable.append((char)(0x7f&b));
}
else if (_more==0)
// Else is this a start byte
else if (_expectedContinuationBytes==0)
{
if ((b&0xc0)!=0xc0)
if ((b & 0xe0) == 0xc0)
{
// 10xxxxxx
_appendable.append('?');
_more=0;
_bits=0;
throw new NotUtf8Exception();
//110xxxxx
_expectedContinuationBytes=1;
_codePoint=b&0x1f;
_minCodePoint=0x80;
}
else if ((b & 0xf0) == 0xe0)
{
//1110xxxx
_expectedContinuationBytes=2;
_codePoint=b&0x0f;
_minCodePoint=0x800;
}
else if ((b & 0xf8) == 0xf0)
{
//11110xxx
_expectedContinuationBytes=3;
_codePoint=b&0x07;
_minCodePoint=0x10000;
}
else if ((b & 0xfc) == 0xf8)
{
//111110xx
_expectedContinuationBytes=4;
_codePoint=b&0x03;
_minCodePoint=0x200000;
}
else if ((b & 0xfe) == 0xfc)
{
//1111110x
_expectedContinuationBytes=5;
_codePoint=b&0x01;
_minCodePoint=0x400000;
}
else
{
if ((b & 0xe0) == 0xc0)
{
//110xxxxx
_more=1;
_bits=b&0x1f;
}
else if ((b & 0xf0) == 0xe0)
{
//1110xxxx
_more=2;
_bits=b&0x0f;
}
else if ((b & 0xf8) == 0xf0)
{
//11110xxx
_more=3;
_bits=b&0x07;
}
else if ((b & 0xfc) == 0xf8)
{
//111110xx
_more=4;
_bits=b&0x03;
}
else if ((b & 0xfe) == 0xfc)
{
//1111110x
_more=5;
_bits=b&0x01;
}
else
{
throw new NotUtf8Exception();
}
{
_appendable.append(REPLACEMENT);
_expectedContinuationBytes=0;
_codePoint=0;
throw new NotUtf8Exception();
}
}
// else is this a continuation character
else if ((b&0xc0)==0x80)
{
// 10xxxxxx
_codePoint=(_codePoint<<6)|(b&0x3f);
// was that the last continuation?
if (--_expectedContinuationBytes==0)
{
// If this a valid unicode point?
if (_codePoint<_minCodePoint || (_codePoint>=0xD800 && _codePoint<=0xDFFF))
{
_appendable.append(REPLACEMENT);
_expectedContinuationBytes=0;
_codePoint=0;
throw new NotUtf8Exception();
}
_minCodePoint=0;
char[] chars = Character.toChars(_codePoint);
for (char c : chars)
_appendable.append(c);
}
}
// Else this is not a continuation character
else
{
if ((b&0xc0)==0xc0)
{ // 11??????
_appendable.append('?');
_more=0;
_bits=0;
throw new NotUtf8Exception();
}
else
{
// 10xxxxxx
_bits=(_bits<<6)|(b&0x3f);
if (--_more==0)
{
if (_bits>=0xD800 && _bits<=0xDFFF)
throw new NotUtf8Exception();
_appendable.append(new String(Character.toChars(_bits)));
}
}
// ! 10xxxxxx
_appendable.append(REPLACEMENT);
_expectedContinuationBytes=0;
_codePoint=0;
throw new NotUtf8Exception();
}
}
public static class NotUtf8Exception extends IllegalStateException
public static class NotUtf8Exception extends IllegalArgumentException
{
public NotUtf8Exception()
{

View File

@ -53,13 +53,13 @@ public class Utf8StringBuffer extends Utf8Appendable
public void reset()
{
_buffer.setLength(0);
_more=0;
_bits=0;
_expectedContinuationBytes=0;
_codePoint=0;
}
public StringBuffer getStringBuffer()
{
if (_more!=0)
if (_expectedContinuationBytes!=0)
throw new NotUtf8Exception();
return _buffer;
}
@ -67,7 +67,7 @@ public class Utf8StringBuffer extends Utf8Appendable
@Override
public String toString()
{
if (_more!=0)
if (_expectedContinuationBytes!=0)
throw new NotUtf8Exception();
return _buffer.toString();
}

View File

@ -52,13 +52,13 @@ public class Utf8StringBuilder extends Utf8Appendable
public void reset()
{
_buffer.setLength(0);
_more=0;
_bits=0;
_expectedContinuationBytes=0;
_codePoint=0;
}
public StringBuilder getStringBuilder()
{
if (_more!=0)
if (_expectedContinuationBytes!=0)
throw new NotUtf8Exception();
return _buffer;
}
@ -66,7 +66,7 @@ public class Utf8StringBuilder extends Utf8Appendable
@Override
public String toString()
{
if (_more!=0)
if (_expectedContinuationBytes!=0)
throw new NotUtf8Exception();
return _buffer.toString();
}

View File

@ -48,9 +48,9 @@ public class Utf8StringBufferTest
buffer.toString();
assertTrue(false);
}
catch(IllegalStateException e)
catch(Utf8Appendable.NotUtf8Exception e)
{
assertTrue(e.toString().indexOf("!UTF-8")>=0);
assertTrue(true);
}
}
@ -70,11 +70,11 @@ public class Utf8StringBufferTest
buffer.append(bytes[i]);
assertTrue(false);
}
catch(IllegalStateException e)
catch(Utf8Appendable.NotUtf8Exception e)
{
assertTrue(e.toString().indexOf("!UTF-8")>=0);
}
assertEquals("abc?",buffer.toString());
assertEquals("abc\ufffd",buffer.toString());
}
@Test

View File

@ -25,19 +25,44 @@ public class Utf8StringBuilderTest
public void testInvalid()
throws Exception
{
Utf8StringBuilder buffer = new Utf8StringBuilder();
buffer.append((byte)0xED);
buffer.append((byte)0xA0);
try
{
buffer.append((byte)0x80);
assertTrue(false);
}
catch(Utf8Appendable.NotUtf8Exception e)
{
assertTrue(true);
}
String[] invalids = {
"c0af",
"EDA080",
"f08080af",
"f8808080af",
"e080af",
"F4908080",
"fbbfbfbfbf"
};
for (String i : invalids)
{
byte[] bytes = TypeUtil.fromHexString(i);
/* Test what JVM does
try
{
String s = new String(bytes,0,bytes.length,"UTF-8");
System.err.println(i+": "+s);
}
catch(Exception e)
{
System.err.println(i+": "+e);
}
*/
try
{
Utf8StringBuilder buffer = new Utf8StringBuilder();
buffer.append(bytes,0,bytes.length);
assertEquals(i,"not expected",buffer.toString());
}
catch(IllegalArgumentException e)
{
assertTrue(i,true);
}
}
}
@Test
@ -69,7 +94,7 @@ public class Utf8StringBuilderTest
buffer.toString();
assertTrue(false);
}
catch(IllegalStateException e)
catch(Utf8Appendable.NotUtf8Exception e)
{
assertTrue(e.toString().indexOf("!UTF-8")>=0);
}
@ -91,11 +116,11 @@ public class Utf8StringBuilderTest
buffer.append(bytes[i]);
assertTrue(false);
}
catch(IllegalStateException e)
catch(Utf8Appendable.NotUtf8Exception e)
{
assertTrue(e.toString().indexOf("!UTF-8")>=0);
assertTrue(true);
}
assertEquals("abc?", buffer.toString());
assertEquals("abc\ufffd", buffer.toString());
}
@ -106,6 +131,7 @@ public class Utf8StringBuilderTest
String source="\uD842\uDF9F";
byte[] bytes=source.getBytes("UTF-8");
// System.err.println(TypeUtil.toHexString(bytes));
String jvmcheck = new String(bytes,0,bytes.length,"UTF-8");
assertEquals(source,jvmcheck);

View File

@ -30,6 +30,7 @@ import org.eclipse.jetty.io.EndPoint;
import org.eclipse.jetty.io.nio.SelectChannelEndPoint;
import org.eclipse.jetty.util.B64Code;
import org.eclipse.jetty.util.StringUtil;
import org.eclipse.jetty.util.TypeUtil;
import org.eclipse.jetty.util.Utf8Appendable;
import org.eclipse.jetty.util.Utf8StringBuilder;
import org.eclipse.jetty.util.log.Log;
@ -628,6 +629,11 @@ public class WebSocketConnectionD13 extends AbstractConnection implements WebSoc
{
boolean lastFrame = isLastFrame(flags);
System.err.println("flags "+flags);
System.err.println("opcode "+opcode);
System.err.println("buffer "+TypeUtil.toHexString(buffer.asArray()));
synchronized(WebSocketConnectionD13.this)
{
// Ignore incoming after a close
@ -827,19 +833,21 @@ public class WebSocketConnectionD13 extends AbstractConnection implements WebSoc
return;
}
}
catch(ThreadDeath th)
{
throw th;
}
catch(Utf8Appendable.NotUtf8Exception notUtf8)
{
LOG.warn("{} for {}",notUtf8,_endp);
LOG.debug(notUtf8);
errorClose(WebSocketConnectionD13.CLOSE_BAD_PAYLOAD,"Invalid UTF-8");
}
catch(ThreadDeath th)
catch(Throwable probablyNotUtf8)
{
throw th;
}
catch(Throwable th)
{
LOG.warn(th);
LOG.warn("{} for {}",probablyNotUtf8,_endp);
LOG.debug(probablyNotUtf8);
errorClose(WebSocketConnectionD13.CLOSE_BAD_PAYLOAD,"Invalid Payload: "+probablyNotUtf8);
}
}