Added partial string support to Utf8Appendable

This commit is contained in:
Greg Wilkins 2017-10-19 19:55:40 +11:00
parent 1daa7664ab
commit 7399947d56
7 changed files with 331 additions and 316 deletions

View File

@ -19,6 +19,7 @@
package org.eclipse.jetty.util;
import java.io.IOException;
import java.lang.reflect.GenericArrayType;
import java.nio.ByteBuffer;
import org.eclipse.jetty.util.log.Log;
@ -176,7 +177,12 @@ public abstract class Utf8Appendable
throw new RuntimeException(e);
}
}
/**
 * Append the whole of the given byte array.
 * <p>Delegates to {@code append(b, 0, b.length)}.</p>
 *
 * @param b the bytes to append; its length is read, so it must not be null
 */
public void append(byte[] b)
{
    final int length = b.length;
    append(b, 0, length);
}
public void append(byte[] b, int offset, int length)
{
try
@ -285,6 +291,27 @@ public abstract class Utf8Appendable
}
}
/**
 * Get the text decoded so far. Each concrete subclass supplies the implementation;
 * {@link #takePartialString()} calls this before resetting the buffer.
 *
 * @return The UTF8 so far decoded, ignoring partial code points
 */
public abstract String getPartialString();
/**
 * Take the partial string and reset the internal buffer, but retain
 * any partial code point.
 * <p>The value of {@code _state} is captured before {@link #reset()} is called and
 * written back afterwards, so the decoder state survives the reset.</p>
 *
 * @return The UTF8 so far decoded, ignoring partial code points
 */
public String takePartialString()
{
    final String decoded = getPartialString();

    // Carry _state across reset() so it is unchanged when this method returns.
    final int pendingState = _state;
    reset();
    _state = pendingState;

    return decoded;
}
public String toReplacedString()
{
if (!isUtf8SequenceComplete())
@ -305,4 +332,5 @@ public abstract class Utf8Appendable
}
return _appendable.toString();
}
}

View File

@ -60,6 +60,12 @@ public class Utf8StringBuffer extends Utf8Appendable
_buffer.setLength(0);
}
/**
 * Returns the current content of {@code _buffer} as-is; no state check is made here.
 *
 * @return the string form of the internal buffer
 */
@Override
public String getPartialString()
{
    final String decoded = _buffer.toString();
    return decoded;
}
public StringBuffer getStringBuffer()
{
checkState();

View File

@ -61,6 +61,12 @@ public class Utf8StringBuilder extends Utf8Appendable
_buffer.setLength(0);
}
/**
 * Returns the current content of {@code _buffer} as-is; no state check is made here.
 *
 * @return the string form of the internal buffer
 */
@Override
public String getPartialString()
{
    final String decoded = _buffer.toString();
    return decoded;
}
public StringBuilder getStringBuilder()
{
checkState();

View File

@ -0,0 +1,290 @@
//
// ========================================================================
// Copyright (c) 1995-2017 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// and Apache License v2.0 which accompanies this distribution.
//
// The Eclipse Public License is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// The Apache License v2.0 is available at
// http://www.opensource.org/licenses/apache2.0.php
//
// You may elect to redistribute this code under either of these licenses.
// ========================================================================
//
package org.eclipse.jetty.util;
import static org.hamcrest.Matchers.is;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.eclipse.jetty.util.Utf8Appendable.NotUtf8Exception;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
/**
 * Tests shared by the {@link Utf8Appendable} implementations.
 * <p>Every test is run once per concrete class listed in {@link #data()}; a fresh
 * instance is created reflectively through {@link #newBuffer()}.</p>
 */
@RunWith(value = Parameterized.class)
public class Utf8AppendableTest
{
    /**
     * @return one parameter set per {@link Utf8Appendable} implementation under test
     */
    @Parameterized.Parameters
    public static Collection<Object[]> data()
    {
        Object[][] data = new Object[][]{
            {Utf8StringBuilder.class},
            {Utf8StringBuffer.class},
        };
        return Arrays.asList(data);
    }

    /** The implementation class exercised by this run. */
    private final Class<? extends Utf8Appendable> test;

    /**
     * @param test the {@link Utf8Appendable} implementation to instantiate for each test
     */
    public Utf8AppendableTest(Class<? extends Utf8Appendable> test)
    {
        this.test = test;
    }

    /**
     * Create a new instance of the implementation under test using its no-arg constructor.
     *
     * @return a fresh appendable
     * @throws RuntimeException wrapping the reflective failure if the class cannot be instantiated
     */
    Utf8Appendable newBuffer()
    {
        try
        {
            return test.getConstructor().newInstance();
        }
        catch (ReflectiveOperationException e)
        {
            throw new RuntimeException("Unable to instantiate " + test.getName(), e);
        }
    }

    /** Appending a valid sequence one byte at a time yields the original string. */
    @Test
    public void testUtf() throws Exception
    {
        String source = "abcd012345\n\r\u0000\u00a4\u10fb\ufffdjetty";
        byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
        Utf8Appendable buffer = newBuffer();
        for (byte aByte : bytes)
            buffer.append(aByte);
        assertEquals(source,buffer.toString());
        assertTrue(buffer.toString().endsWith("jetty"));
    }

    /** A sequence cut short by one byte makes {@code toString()} throw. */
    @Test(expected = IllegalArgumentException.class)
    public void testUtf8WithMissingByte() throws Exception
    {
        String source = "abc\u10fb";
        byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
        Utf8Appendable buffer = newBuffer();
        // Deliberately stop before the final byte of the multi-byte character.
        for (int i = 0; i < bytes.length - 1; i++)
            buffer.append(bytes[i]);
        buffer.toString();
    }

    /** An invalid lead/continuation pair is rejected while appending. */
    @Test(expected = NotUtf8Exception.class)
    public void testUtf8WithAdditionalByte() throws Exception
    {
        String source = "abcXX";
        byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
        bytes[3] = (byte)0xc0;
        bytes[4] = (byte)0x00;
        Utf8Appendable buffer = newBuffer();
        for (byte aByte : bytes)
            buffer.append(aByte);
    }

    /** A supplementary character (surrogate pair in UTF-16) round-trips. */
    @Test
    public void testUTF32codes() throws Exception
    {
        String source = "\uD842\uDF9F";
        byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
        // Sanity check that the JVM itself round-trips the bytes.
        String jvmcheck = new String(bytes,0,bytes.length,StandardCharsets.UTF_8);
        assertEquals(source,jvmcheck);
        Utf8Appendable buffer = newBuffer();
        buffer.append(bytes,0,bytes.length);
        String result = buffer.toString();
        assertEquals(source,result);
    }

    /** Three two-byte characters appended byte by byte decode correctly. */
    @Test
    public void testGermanUmlauts() throws Exception
    {
        byte[] bytes = new byte[]{
            (byte)0xC3, (byte)0xBC,
            (byte)0xC3, (byte)0xB6,
            (byte)0xC3, (byte)0xA4,
        };
        Utf8Appendable buffer = newBuffer();
        for (byte aByte : bytes)
            buffer.append(aByte);
        assertEquals("\u00FC\u00F6\u00E4",buffer.toString());
    }

    /** A lead byte followed by another lead byte is rejected. */
    @Test(expected = NotUtf8Exception.class)
    public void testInvalidUTF8() throws UnsupportedEncodingException
    {
        Utf8Appendable buffer = newBuffer();
        buffer.append((byte)0xC2);
        buffer.append((byte)0xC2);
    }

    /** An invalid four-byte sequence appended in one call is rejected. */
    @Test
    public void testFastFail_1() throws Exception
    {
        byte[] part1 = TypeUtil.fromHexString("cebae1bdb9cf83cebcceb5");
        byte[] part2 = TypeUtil.fromHexString("f4908080"); // INVALID
        // Here for test tracking reasons, not needed to satisfy test
        // byte[] part3 = TypeUtil.fromHexString("656469746564");
        Utf8Appendable buffer = newBuffer();
        // Part 1 is valid
        buffer.append(part1,0,part1.length);
        try
        {
            // Part 2 is invalid
            buffer.append(part2,0,part2.length);
            Assert.fail("Should have thrown a NotUtf8Exception");
        }
        catch (NotUtf8Exception e)
        {
            // expected path
        }
    }

    /** The invalid second byte of a sequence is rejected as soon as it is appended. */
    @Test
    public void testFastFail_2() throws Exception
    {
        byte[] part1 = TypeUtil.fromHexString("cebae1bdb9cf83cebcceb5f4");
        byte[] part2 = TypeUtil.fromHexString("90"); // INVALID
        // Here for test search/tracking reasons, not needed to satisfy test
        // byte[] part3 = TypeUtil.fromHexString("8080656469746564");
        Utf8Appendable buffer = newBuffer();
        // Part 1 is valid
        buffer.append(part1,0,part1.length);
        try
        {
            // Part 2 is invalid
            buffer.append(part2,0,part2.length);
            Assert.fail("Should have thrown a NotUtf8Exception");
        }
        catch (NotUtf8Exception e)
        {
            // expected path
        }
    }

    /** Taking the partial string between two appends that end on character boundaries. */
    @Test
    public void testPartial_UnsplitCodepoint()
    {
        Utf8Appendable utf8 = newBuffer();
        String seq1 = "Hello-\uC2B5@\uC39F\uC3A4";
        String seq2 = "\uC3BC\uC3A0\uC3A1-UTF-8!!";
        utf8.append(BufferUtil.toBuffer(seq1,StandardCharsets.UTF_8));
        String ret1 = utf8.takePartialString();
        utf8.append(BufferUtil.toBuffer(seq2,StandardCharsets.UTF_8));
        String ret2 = utf8.takePartialString();
        assertThat("Seq1",ret1,is(seq1));
        assertThat("Seq2",ret2,is(seq2));
    }

    /** Taking the partial string while a multi-byte character is split across two appends. */
    @Test
    public void testPartial_SplitCodepoint()
    {
        Utf8Appendable utf8 = newBuffer();
        // seq1 ends two bytes into a three-byte character; seq2 starts with its last byte.
        String seq1 = "48656C6C6F2DEC8AB540EC8E9FEC8E";
        String seq2 = "A4EC8EBCEC8EA0EC8EA12D5554462D382121";
        utf8.append(TypeUtil.fromHexString(seq1));
        String ret1 = utf8.takePartialString();
        utf8.append(TypeUtil.fromHexString(seq2));
        String ret2 = utf8.takePartialString();
        assertThat("Seq1",ret1,is("Hello-\uC2B5@\uC39F"));
        assertThat("Seq2",ret2,is("\uC3A4\uC3BC\uC3A0\uC3A1-UTF-8!!"));
    }

    /** A second take with nothing appended in between returns an empty string and keeps the pending bytes. */
    @Test
    public void testPartial_SplitCodepoint_WithNoBuf()
    {
        Utf8Appendable utf8 = newBuffer();
        String seq1 = "48656C6C6F2DEC8AB540EC8E9FEC8E";
        String seq2 = "A4EC8EBCEC8EA0EC8EA12D5554462D382121";
        utf8.append(TypeUtil.fromHexString(seq1));
        String ret1 = utf8.takePartialString();
        String ret2 = utf8.takePartialString();
        utf8.append(TypeUtil.fromHexString(seq2));
        String ret3 = utf8.takePartialString();
        assertThat("Seq1",ret1,is("Hello-\uC2B5@\uC39F"));
        assertThat("Seq2",ret2,is(""));
        assertThat("Seq3",ret3,is("\uC3A4\uC3BC\uC3A0\uC3A1-UTF-8!!"));
    }

    /** Each listed byte sequence (given as hex) must be rejected with a {@link NotUtf8Exception}. */
    @Test
    public void testBadUtf8()
    {
        List<String> data = new ArrayList<>();
        data.add("c0af");
        data.add("EDA080");
        data.add("f08080af");
        data.add("f8808080af");
        data.add("e080af");
        data.add("F4908080");
        data.add("fbbfbfbfbf");
        data.add("10FFFF");
        data.add("CeBaE1BdB9Cf83CeBcCeB5EdA080656469746564");
        // use of UTF-16 High Surrogates (in codepoint form)
        data.add("da07");
        data.add("d807");
        // decoded UTF-16 High Surrogate "\ud807" (in UTF-8 form)
        data.add("EDA087");

        for (String hex : data)
        {
            Utf8Appendable utf8 = newBuffer();
            try
            {
                utf8.append(TypeUtil.fromHexString(hex));
                // Name the offending input so a failure can be traced to a specific sequence.
                Assert.fail("Should have thrown a NotUtf8Exception for " + hex + " (" + test.getSimpleName() + ")");
            }
            catch (NotUtf8Exception e)
            {
                // expected
            }
        }
    }
}

View File

@ -1,107 +0,0 @@
//
// ========================================================================
// Copyright (c) 1995-2017 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// and Apache License v2.0 which accompanies this distribution.
//
// The Eclipse Public License is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// The Apache License v2.0 is available at
// http://www.opensource.org/licenses/apache2.0.php
//
// You may elect to redistribute this code under either of these licenses.
// ========================================================================
//
package org.eclipse.jetty.util;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import org.junit.Test;
/**
 * Decoding tests for {@link Utf8StringBuffer}.
 */
public class Utf8StringBufferTest
{
    /** Feeding a valid sequence byte by byte reproduces the source string. */
    @Test
    public void testUtfStringBuffer() throws Exception
    {
        String source = "abcd012345\n\r\u0000\u00a4\u10fb\ufffdjetty";
        byte[] encoded = source.getBytes(StandardCharsets.UTF_8);
        Utf8StringBuffer decoder = new Utf8StringBuffer();
        for (int i = 0; i < encoded.length; i++)
        {
            decoder.append(encoded[i]);
        }
        assertEquals(source,decoder.toString());
        assertTrue(decoder.toString().endsWith("jetty"));
    }

    /** Dropping the final byte of a multi-byte character makes {@code toString()} throw. */
    @Test(expected = IllegalArgumentException.class)
    public void testUtf8WithMissingByte() throws Exception
    {
        String source = "abc\u10fb";
        byte[] encoded = source.getBytes(StandardCharsets.UTF_8);
        Utf8StringBuffer decoder = new Utf8StringBuffer();
        // Everything except the last byte is appended.
        final int truncatedLength = encoded.length - 1;
        int index = 0;
        while (index < truncatedLength)
        {
            decoder.append(encoded[index]);
            index++;
        }
        decoder.toString();
    }

    /** An invalid lead/continuation pair is rejected while appending. */
    @Test(expected = Utf8Appendable.NotUtf8Exception.class)
    public void testUtf8WithAdditionalByte() throws Exception
    {
        String source = "abcXX";
        byte[] encoded = source.getBytes(StandardCharsets.UTF_8);
        encoded[3] = (byte)0xc0;
        encoded[4] = (byte)0x00;
        Utf8StringBuffer decoder = new Utf8StringBuffer();
        for (int i = 0; i < encoded.length; i++)
        {
            decoder.append(encoded[i]);
        }
    }

    /** A supplementary character round-trips through the decoder. */
    @Test
    public void testUTF32codes() throws Exception
    {
        String source = "\uD842\uDF9F";
        byte[] encoded = source.getBytes(StandardCharsets.UTF_8);

        // First confirm the JVM's own decoder agrees with the source.
        assertEquals(source,new String(encoded,0,encoded.length,StandardCharsets.UTF_8));

        Utf8StringBuffer decoder = new Utf8StringBuffer();
        decoder.append(encoded,0,encoded.length);
        assertEquals(source,decoder.toString());
    }

    /** Three two-byte characters appended byte by byte decode correctly. */
    @Test
    public void testGermanUmlauts() throws Exception
    {
        byte[] encoded = {
            (byte)0xC3, (byte)0xBC,
            (byte)0xC3, (byte)0xB6,
            (byte)0xC3, (byte)0xA4
        };
        Utf8StringBuffer decoder = new Utf8StringBuffer();
        for (byte b : encoded)
        {
            decoder.append(b);
        }
        assertEquals("\u00FC\u00F6\u00E4",decoder.toString());
    }

    /** A lead byte followed by another lead byte is rejected. */
    @Test(expected = Utf8Appendable.NotUtf8Exception.class)
    public void testInvalidUTF8() throws UnsupportedEncodingException
    {
        Utf8StringBuffer decoder = new Utf8StringBuffer();
        final byte lead = (byte)0xC2;
        decoder.append(lead);
        decoder.append(lead);
    }
}

View File

@ -1,71 +0,0 @@
//
// ========================================================================
// Copyright (c) 1995-2017 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// and Apache License v2.0 which accompanies this distribution.
//
// The Eclipse Public License is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// The Apache License v2.0 is available at
// http://www.opensource.org/licenses/apache2.0.php
//
// You may elect to redistribute this code under either of these licenses.
// ========================================================================
//
package org.eclipse.jetty.util;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.eclipse.jetty.util.Utf8Appendable.NotUtf8Exception;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;
/**
 * Checks that {@link Utf8StringBuilder} rejects a set of invalid UTF8 byte sequences,
 * one parameterized run per sequence.
 */
@RunWith(Parameterized.class)
public class Utf8StringBuilderInvalidUtfTest
{
    /**
     * @return the invalid sequences, each as a single hex string parameter
     */
    @Parameters
    public static Collection<Object[]> data()
    {
        final String[] invalidHex = {
            "c0af",
            "EDA080",
            "f08080af",
            "f8808080af",
            "e080af",
            "F4908080",
            "fbbfbfbfbf",
            "10FFFF",
            "CeBaE1BdB9Cf83CeBcCeB5EdA080656469746564",
            // use of UTF-16 High Surrogates (in codepoint form)
            "da07",
            "d807",
            // decoded UTF-16 High Surrogate "\ud807" (in UTF-8 form)
            "EDA087"
        };

        List<Object[]> parameters = new ArrayList<>();
        for (String hex : invalidHex)
        {
            parameters.add(new String[]{hex});
        }
        return parameters;
    }

    /** The raw bytes decoded from the hex parameter. */
    private byte[] bytes;

    /**
     * @param rawhex the invalid sequence as a hex string
     */
    public Utf8StringBuilderInvalidUtfTest(String rawhex)
    {
        bytes = TypeUtil.fromHexString(rawhex);
        String normalizedHex = TypeUtil.toHexString(bytes);
        System.out.printf("Utf8StringBuilderInvalidUtfTest[] (%s)%n", normalizedHex);
    }

    /** Appending the sequence must throw {@link NotUtf8Exception}. */
    @Test(expected=NotUtf8Exception.class)
    public void testInvalidUTF8()
    {
        Utf8StringBuilder decoder = new Utf8StringBuilder();
        decoder.append(bytes,0,bytes.length);
    }
}

View File

@ -1,137 +0,0 @@
//
// ========================================================================
// Copyright (c) 1995-2017 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// and Apache License v2.0 which accompanies this distribution.
//
// The Eclipse Public License is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// The Apache License v2.0 is available at
// http://www.opensource.org/licenses/apache2.0.php
//
// You may elect to redistribute this code under either of these licenses.
// ========================================================================
//
package org.eclipse.jetty.util;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
import org.junit.Test;
/**
 * Decoding tests for {@link Utf8StringBuilder}.
 */
public class Utf8StringBuilderTest
{
    /**
     * Append {@code invalid} to {@code decoder} and require a NotUtf8Exception.
     */
    private static void appendExpectingNotUtf8(Utf8StringBuilder decoder, byte[] invalid)
    {
        try
        {
            decoder.append(invalid,0,invalid.length);
        }
        catch (Utf8Appendable.NotUtf8Exception e)
        {
            // expected path
            return;
        }
        Assert.fail("Should have thrown a NotUtf8Exception");
    }

    /** An invalid four-byte sequence appended in one call is rejected. */
    @Test
    public void testFastFail_1() throws Exception
    {
        byte[] valid = TypeUtil.fromHexString("cebae1bdb9cf83cebcceb5");
        byte[] invalid = TypeUtil.fromHexString("f4908080"); // INVALID
        // Here for test tracking reasons, not needed to satisfy test
        // byte[] part3 = TypeUtil.fromHexString("656469746564");

        Utf8StringBuilder decoder = new Utf8StringBuilder();
        // The first part is valid
        decoder.append(valid,0,valid.length);
        // The second part is invalid
        appendExpectingNotUtf8(decoder,invalid);
    }

    /** The invalid second byte of a sequence is rejected as soon as it is appended. */
    @Test
    public void testFastFail_2() throws Exception
    {
        byte[] valid = TypeUtil.fromHexString("cebae1bdb9cf83cebcceb5f4");
        byte[] invalid = TypeUtil.fromHexString("90"); // INVALID
        // Here for test search/tracking reasons, not needed to satisfy test
        // byte[] part3 = TypeUtil.fromHexString("8080656469746564");

        Utf8StringBuilder decoder = new Utf8StringBuilder();
        // The first part is valid
        decoder.append(valid,0,valid.length);
        // The second part is invalid
        appendExpectingNotUtf8(decoder,invalid);
    }

    /** Feeding a valid sequence byte by byte reproduces the source string. */
    @Test
    public void testUtfStringBuilder() throws Exception
    {
        String source = "abcd012345\n\r\u0000\u00a4\u10fb\ufffdjetty";
        byte[] encoded = source.getBytes(StandardCharsets.UTF_8);
        Utf8StringBuilder decoder = new Utf8StringBuilder();
        for (int i = 0; i < encoded.length; i++)
        {
            decoder.append(encoded[i]);
        }
        assertEquals(source,decoder.toString());
        assertTrue(decoder.toString().endsWith("jetty"));
    }

    /** Dropping the final byte of a multi-byte character makes {@code toString()} throw. */
    @Test(expected = IllegalArgumentException.class)
    public void testShort() throws Exception
    {
        String source = "abc\u10fb";
        byte[] encoded = source.getBytes(StandardCharsets.UTF_8);
        Utf8StringBuilder decoder = new Utf8StringBuilder();
        // Everything except the last byte is appended.
        final int truncatedLength = encoded.length - 1;
        int index = 0;
        while (index < truncatedLength)
        {
            decoder.append(encoded[index]);
            index++;
        }
        decoder.toString();
    }

    /** An invalid pair is rejected, and the text decoded up to that point is still checked afterwards. */
    @Test
    public void testLong() throws Exception
    {
        String source = "abcXX";
        byte[] encoded = source.getBytes(StandardCharsets.UTF_8);
        encoded[3] = (byte)0xc0;
        encoded[4] = (byte)0x00;
        Utf8StringBuilder decoder = new Utf8StringBuilder();
        try
        {
            for (int i = 0; i < encoded.length; i++)
            {
                decoder.append(encoded[i]);
            }
            Assert.fail("Should have resulted in an Utf8Appendable.NotUtf8Exception");
        }
        catch (Utf8Appendable.NotUtf8Exception e)
        {
            // expected path
        }
        String decoded = decoder.toString();
        assertEquals("abc\ufffd",decoded);
    }

    /** A supplementary character round-trips through the decoder. */
    @Test
    public void testUTF32codes() throws Exception
    {
        String source = "\uD842\uDF9F";
        byte[] encoded = source.getBytes(StandardCharsets.UTF_8);

        // First confirm the JVM's own decoder agrees with the source.
        assertEquals(source,new String(encoded,0,encoded.length,StandardCharsets.UTF_8));

        Utf8StringBuilder decoder = new Utf8StringBuilder();
        decoder.append(encoded,0,encoded.length);
        assertEquals(source,decoder.toString());
    }
}