Completed SearchPattern class and tests

Signed-off-by: Lachlan Roberts <lachlan@webtide.com>
This commit is contained in:
Lachlan Roberts 2018-03-07 17:38:44 +11:00
parent f68cf85e94
commit 809aee129e
2 changed files with 221 additions and 65 deletions

View File

@ -18,34 +18,55 @@
package org.eclipse.jetty.util;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
/**
* SearchPattern
*
* Fast search for patterns within strings and arrays of bytes.
* Uses an implementation of the BoyerMooreHorspool algorithm
* with a 256 character alphabet.
*
* The algorithm has an average-case complexity of O(n)
* on random text and O(nm) in the worst case,
* where:
* m = pattern length
* n = length of data to search
*/
public class SearchPattern
{
static final int alphabetSize = 256;
int[] table;
byte[] pattern;
public int[] getTable(){ return this.table; }
private int[] table;
private byte[] pattern;
/**
* @param pattern The pattern to search for.
* @return A Pattern instance for the search pattern
*/
static SearchPattern compile(byte[] pattern)
public static SearchPattern compile(byte[] pattern)
{
//Create new SearchPattern instance
SearchPattern sp = new SearchPattern();
return new SearchPattern(Arrays.copyOf(pattern, pattern.length));
}
/**
 * Compiles a search pattern from the UTF-8 encoded bytes of a string.
 * @param pattern The pattern to search for.
 * @return A SearchPattern instance for the search pattern
 */
public static SearchPattern compile(String pattern)
{
    final byte[] encoded = pattern.getBytes(StandardCharsets.UTF_8);
    return new SearchPattern(encoded);
}
/**
 * Stores the pattern and pre-computes the Boyer-Moore-Horspool shift table for it.
 * The array is kept as given (not copied); callers that need isolation must pass a copy.
 * @param pattern The pattern to search for; must contain at least one byte.
 * @throws IllegalArgumentException if the pattern is empty
 */
private SearchPattern(byte[] pattern)
{
    if (pattern.length == 0)
        throw new IllegalArgumentException("Empty Pattern");

    this.pattern = pattern;

    // Default shift: a byte that does not occur in the pattern allows a jump of the full pattern length.
    table = new int[alphabetSize];
    Arrays.fill(table, pattern.length);

    // Every byte except the last is assigned the distance from its right-most occurrence
    // to the end of the pattern. Java bytes are signed, so mask with 0xFF to obtain an
    // index in 0..255; an unmasked byte >= 0x80 would be a negative array index.
    for (int i = 0; i < pattern.length - 1; ++i)
        table[pattern[i] & 0xFF] = pattern.length - 1 - i;
}
@ -59,8 +80,10 @@ public class SearchPattern
*/
public int match(byte[] data, int offset, int length)
{
validate(data, offset, length);
int skip = offset;
while((skip <= data.length - pattern.length) && (skip+pattern.length <= offset+length))
while(skip <= offset+length - pattern.length)
{
for(int i = pattern.length-1; data[skip+i] == pattern[i]; i--)
if(i==0) return skip;
@ -77,25 +100,68 @@ public class SearchPattern
* but the pattern will always be {@link StandardCharsets#US_ASCII} encoded.
* @param offset The offset within the data to start the search
* @param length The length of the data to search
* @return the length of the partial pattern matched or -1 for no match.
* @return the length of the partial pattern matched and 0 for no match.
*/
public int endsPartiallyWith(byte data, int offset, int length)
{
return -1;
/**
 * Search for a possibly partial match of the pattern at the end of the data.
 * Only bytes in {@code data[offset, offset+length)} are examined.
 * @param data The data in which to search for.
 * @param offset The offset within the data to start the search
 * @param length The length of the data to search
 * @return the number of trailing bytes of the searched region that equal the
 * start of the pattern (the pattern length for a complete match) and 0 for no match.
 * @throws IllegalArgumentException if offset or length is negative, or the region exceeds the data
 */
public int endsWith(byte[] data, int offset, int length)
{
    validate(data, offset, length);

    final int end = offset + length;

    // The earliest useful start leaves room for the whole pattern; if the region
    // is shorter than the pattern, start at the beginning of the region.
    int start = (pattern.length <= length) ? (end - pattern.length) : offset;

    // Try each candidate start from the longest possible suffix to the shortest.
    // Advancing one byte at a time keeps every read inside the region; a shift derived
    // from a byte past the region's end could jump over a valid partial match.
    for (; start < end; start++)
    {
        // Compare data[start, end) against the pattern prefix of the same length, right to left.
        for (int i = (end - 1) - start; data[start + i] == pattern[i]; --i)
        {
            if (i == 0)
                return end - start;
        }
    }

    return 0;
}
/**
*
* Search for a partial match of the pattern at the start of the data.
* Search for a possibly partial match of the pattern at the start of the data.
* @param data The data in which to search for. The data may be arbitrary binary data,
* but the pattern will always be {@link StandardCharsets#US_ASCII} encoded.
* @param offset The offset within the data to start the search
* @param length The length of the data to search
* @param matched The length of the partial pattern already matched
* @return the length of the partial pattern matched or -1 for no match.
* @return the length of the partial pattern matched and 0 for no match.
*/
public int startsPartialyWith(byte[] data, int offset, int length, int matched)
public int startsWith(byte[] data, int offset, int length, int matched)
{
return -1;
validate(data, offset, length);
int matchedCount = 0;
for(int i=0; i<pattern.length-matched && i < offset+length; i++)
{
if(data[i] == pattern[i+matched])
matchedCount++;
else
break;
}
return matchedCount;
}
/**
 * Performs legality checks for standard arguments input into SearchPattern methods.
 * @param data The data in which to search for.
 * @param offset The offset within the data to start the search
 * @param length The length of the data to search
 * @throws IllegalArgumentException if offset or length is negative, or if the
 * region described by offset and length extends past the end of data
 */
private void validate(byte[] data, int offset, int length)
{
    if (offset < 0)
        throw new IllegalArgumentException("offset was negative");
    if (length < 0)
        throw new IllegalArgumentException("length was negative");

    // Compare by subtraction: offset + length can overflow int and wrap to a
    // negative value, which would wrongly pass an addition-based bounds check.
    // offset is known to be non-negative here, so data.length - offset cannot overflow.
    if (length > data.length - offset)
        throw new IllegalArgumentException("(offset+length) out of bounds of data[]");
}
}

View File

@ -20,45 +20,46 @@ package org.eclipse.jetty.util;
import static org.junit.Assert.*;
import java.nio.charset.StandardCharsets;
import org.junit.Assert;
import org.junit.Test;
public class SearchPatternTest
{
@Test
public void testBasicSearch()
{
    byte[] p1 = "truth".getBytes(StandardCharsets.US_ASCII);
    byte[] p2 = "evident".getBytes(StandardCharsets.US_ASCII);
    byte[] p3 = "we".getBytes(StandardCharsets.US_ASCII);
    byte[] d = "we hold these truths to be self evident".getBytes(StandardCharsets.US_ASCII);

    // Testing Compiled Pattern p1 "truth" (occurs in the middle of the data)
    SearchPattern sp1 = SearchPattern.compile(p1);
    Assert.assertEquals(14,sp1.match(d, 0, d.length));
    Assert.assertEquals(14,sp1.match(d,14,p1.length));
    Assert.assertEquals(14,sp1.match(d,14,p1.length+1));
    Assert.assertEquals(-1,sp1.match(d,14,p1.length-1));
    Assert.assertEquals(-1,sp1.match(d,15,d.length-15));

    // Testing Compiled Pattern p2 "evident" (occurs at the very end of the data,
    // so the search window cannot be extended past the pattern length)
    SearchPattern sp2 = SearchPattern.compile(p2);
    Assert.assertEquals(32,sp2.match(d, 0, d.length));
    Assert.assertEquals(32,sp2.match(d,32,p2.length));
    Assert.assertEquals(32,sp2.match(d,32,p2.length));
    Assert.assertEquals(-1,sp2.match(d,32,p2.length-1));
    Assert.assertEquals(-1,sp2.match(d,33,d.length-33));

    // Testing Compiled Pattern p3 "we" (occurs at the very start of the data)
    SearchPattern sp3 = SearchPattern.compile(p3);
    Assert.assertEquals( 0,sp3.match(d, 0, d.length));
    Assert.assertEquals( 0,sp3.match(d, 0, p3.length));
    Assert.assertEquals( 0,sp3.match(d, 0, p3.length+1));
    Assert.assertEquals(-1,sp3.match(d, 0, p3.length-1));
    Assert.assertEquals(-1,sp3.match(d, 1, d.length-1));
}
@ -66,17 +67,106 @@ public class SearchPatternTest
@Test
public void testDoubleMatch()
{
    byte[] p = "violent".getBytes(StandardCharsets.US_ASCII);
    byte[] d = "These violent delights have violent ends.".getBytes(StandardCharsets.US_ASCII);

    // Testing Compiled Pattern p "violent", which occurs twice in the data
    SearchPattern sp = SearchPattern.compile(p);
    Assert.assertEquals( 6,sp.match(d, 0, d.length));
    Assert.assertEquals(-1,sp.match(d, 6, p.length-1));
    Assert.assertEquals(28,sp.match(d, 7, d.length-7));
    Assert.assertEquals(28,sp.match(d, 28, d.length-28));
    Assert.assertEquals(-1,sp.match(d, 29, d.length-29));
}
@Test
public void testAlmostMatch()
{
    // The data holds several near misses of "violent" but never the complete word.
    final byte[] needle = "violent".getBytes(StandardCharsets.US_ASCII);
    final byte[] haystack = "vio lent violen v iolent violin vioviolenlent viiolent".getBytes(StandardCharsets.US_ASCII);

    final int found = SearchPattern.compile(needle).match(haystack, 0, haystack.length);
    Assert.assertEquals(-1, found);
}
@Test
public void testOddSizedPatterns()
{
    // A pattern longer than the data is reported as not found.
    final byte[] longPattern = "pneumonoultramicroscopicsilicovolcanoconiosis".getBytes(StandardCharsets.US_ASCII);
    final byte[] shortData = "pneumon".getBytes(StandardCharsets.US_ASCII);
    final SearchPattern longSearch = SearchPattern.compile(longPattern);
    Assert.assertEquals(-1, longSearch.match(shortData, 0, shortData.length));

    // A single-character pattern is found at its first occurrence.
    final byte[] singleChar = "s".getBytes(StandardCharsets.US_ASCII);
    final byte[] sentence = "the cake is a lie".getBytes(StandardCharsets.US_ASCII);
    final SearchPattern singleSearch = SearchPattern.compile(singleChar);
    Assert.assertEquals(10, singleSearch.match(sentence, 0, sentence.length));
}
@Test
public void testEndsWith()
{
    // Data shorter than the pattern: the whole data is a prefix of the pattern.
    final byte[] longWord = "pneumonoultramicroscopicsilicovolcanoconiosis".getBytes(StandardCharsets.US_ASCII);
    final byte[] wordStart = "pneumonoultrami".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(15, SearchPattern.compile(longWord).endsWith(wordStart, 0, wordStart.length));

    final byte[] alphabet = "abcdefghijklmnopqrstuvwxyz".getBytes(StandardCharsets.US_ASCII);
    final SearchPattern alphabetSearch = SearchPattern.compile(alphabet);

    // One complete alphabet followed by a partial one ("a".."o") at the end.
    final byte[] partialTail = "abcdefghijklmnopqrstuvwxyzabcdefghijklmno".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(0, alphabetSearch.match(partialTail, 0, partialTail.length));
    Assert.assertEquals(-1, alphabetSearch.match(partialTail, 1, partialTail.length - 1));
    Assert.assertEquals(15, alphabetSearch.endsWith(partialTail, 0, partialTail.length));

    // Two complete alphabets: the data ends with the entire pattern.
    final byte[] fullTail = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(0, alphabetSearch.match(fullTail, 0, fullTail.length));
    Assert.assertEquals(26, alphabetSearch.match(fullTail, 1, fullTail.length - 1));
    Assert.assertEquals(26, alphabetSearch.endsWith(fullTail, 0, fullTail.length));

    //test no match
    final byte[] greeting = "hello world".getBytes(StandardCharsets.US_ASCII);
    final byte[] unrelated = "there is definitely no match in here".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(0, SearchPattern.compile(greeting).endsWith(unrelated, 0, unrelated.length));
}
@Test
public void testStartsWith()
{
    final byte[] alphabet = "abcdefghijklmnopqrstuvwxyz".getBytes(StandardCharsets.US_ASCII);
    final SearchPattern alphabetSearch = SearchPattern.compile(alphabet);

    // Data begins with the pattern remainder ("i".."z") and is followed directly by a full pattern.
    final byte[] adjacent = "ijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(18, alphabetSearch.match(adjacent, 0, adjacent.length));
    Assert.assertEquals(-1, alphabetSearch.match(adjacent, 19, adjacent.length - 19));
    Assert.assertEquals(18, alphabetSearch.startsWith(adjacent, 0, adjacent.length, 8));

    // Same remainder, but separated from the following full pattern by a space.
    final byte[] separated = "ijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(19, alphabetSearch.match(separated, 0, separated.length));
    Assert.assertEquals(-1, alphabetSearch.match(separated, 20, separated.length - 20));
    Assert.assertEquals(18, alphabetSearch.startsWith(separated, 0, separated.length, 8));

    // Nothing matched previously: the data starts with the entire pattern.
    final byte[] wholeFirst = "abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(26, alphabetSearch.startsWith(wholeFirst, 0, wholeFirst.length, 0));

    //test no match
    final byte[] greeting = "hello world".getBytes(StandardCharsets.US_ASCII);
    final byte[] unrelated = "there is definitely no match in here".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(0, SearchPattern.compile(greeting).startsWith(unrelated, 0, unrelated.length, 0));

    //test large pattern small buffer
    final byte[] slice = "mnopqrs".getBytes(StandardCharsets.US_ASCII);
    Assert.assertEquals(7, alphabetSearch.startsWith(slice, 0, slice.length, 12));
}
}