diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java index a4fcf6abb..f7aa5b692 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasher.java @@ -16,7 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter.hasher; -import java.nio.charset.StandardCharsets; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import java.util.NoSuchElementException; @@ -35,7 +35,7 @@ public class DynamicHasher implements Hasher { public static class Builder implements Hasher.Builder { /** - * The list of byte[] that are to be hashed. + * The list of items (each as a byte[]) that are to be hashed. */ private final List buffers; @@ -54,35 +54,31 @@ public class DynamicHasher implements Hasher { this.buffers = new ArrayList<>(); } - /** - * Builds the hasher. - * - * @return A DynamicHasher with the specified name, function and buffers. - */ @Override public DynamicHasher build() throws IllegalArgumentException { - return new DynamicHasher(function, buffers); + // Assumes the hasher will create a copy of the buffers; NOTE(review): confirm, else clear() empties it + final DynamicHasher hasher = new DynamicHasher(function, buffers); + // Reset for further use + buffers.clear(); + return hasher; } @Override - public final Builder with(final byte property) { - return with(new byte[] {property}); - } - - @Override - public final Builder with(final byte[] property) { + public final DynamicHasher.Builder with(final byte[] property) { buffers.add(property); return this; } - /** - * {@inheritDoc} - * - *

The string is converted to a byte array using the UTF-8 Character set. - */ @Override - public final Builder with(final String property) { - return with(property.getBytes(StandardCharsets.UTF_8)); + public DynamicHasher.Builder with(CharSequence item, Charset charset) { + Hasher.Builder.super.with(item, charset); + return this; + } + + @Override + public DynamicHasher.Builder withUnencoded(CharSequence item) { + Hasher.Builder.super.withUnencoded(item); + return this; } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java index 5816712c0..8f5d5c239 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java @@ -16,6 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter.hasher; +import java.nio.charset.Charset; import java.util.PrimitiveIterator; /** @@ -46,39 +47,62 @@ public interface Hasher { /** * A builder to build a hasher. + * + *

<p>A hasher represents one or more items of arbitrary byte size. The builder + * contains methods to collect byte representations of items. Each method to add + * to the builder will add an entire item to the final hasher created by the + * {@link #build()} method. + * * @since 4.5 */ interface Builder { /** - * Builds the hasher. + * Builds the hasher from all the items. + * + *

<p>This method will clear the builder for future use. + * * @return the fully constructed hasher */ Hasher build(); /** - * Adds a byte to the hasher. + * Adds a byte array item to the hasher. * - * @param property the byte to add + * @param item the item to add * @return a reference to this object */ - Builder with(byte property); + Builder with(byte[] item); /** - * Adds an array of bytes to the hasher. + * Adds a character sequence item to the hasher using the specified {@code charset} + * encoding. * - * @param property the array of bytes to add + * @param item the item to add + * @param charset the character set * @return a reference to this object */ - Builder with(byte[] property); + default Builder with(CharSequence item, Charset charset) { + return with(item.toString().getBytes(charset)); + } /** - * Adds a string to the hasher. + * Adds a character sequence item to the hasher. Each 16-bit character is + * converted to 2 bytes using little-endian order. * - * @param property the string to add + * @param item the item to add * @return a reference to this object */ - Builder with(String property); + default Builder withUnencoded(CharSequence item) { + int length = item.length(); + final byte[] bytes = new byte[length * 2]; + for (int i = 0; i < length; i++) { + final char ch = item.charAt(i); + bytes[i * 2] = (byte) ch; + bytes[i * 2 + 1] = (byte) (ch >>> 8); + } + return with(bytes); + } } /** diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java index 0ed0cc1f9..e9b63bac1 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherBloomFilterTest.java @@ -25,7 +25,7 @@ import org.apache.commons.collections4.bloomfilter.hasher.Shape; import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; import 
org.junit.Assert; import org.junit.Test; - +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.PrimitiveIterator.OfInt; @@ -40,7 +40,7 @@ public class HasherBloomFilterTest extends AbstractBloomFilterTest { @Test public void constructorTest_NonStatic() { final Shape shape = new Shape(new MD5Cyclic(), 3, 72, 17); - final DynamicHasher hasher = new DynamicHasher.Builder(new MD5Cyclic()).with("Hello").build(); + final DynamicHasher hasher = new DynamicHasher.Builder(new MD5Cyclic()).with("Hello", StandardCharsets.UTF_8).build(); final HasherBloomFilter filter = createFilter(hasher, shape); final long[] lb = filter.getBits(); assertEquals(2, lb.length); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java index 0fcf49e3e..94e685cf0 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherBuilderTest.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import java.nio.charset.StandardCharsets; import java.util.PrimitiveIterator.OfInt; import org.apache.commons.collections4.bloomfilter.hasher.function.MD5Cyclic; @@ -32,31 +33,18 @@ import org.junit.Test; public class DynamicHasherBuilderTest { private DynamicHasher.Builder builder; - private final Shape shape = new Shape(new MD5Cyclic(), 1, Integer.MAX_VALUE, 1); - - /** - * Tests that hashing a byte works as expected. 
- */ - @Test - public void buildTest_byte() { - final DynamicHasher hasher = builder.with((byte) 0x1).build(); - - final int expected = 1483089307; - - final OfInt iter = hasher.iterator(shape); - - assertTrue(iter.hasNext()); - assertEquals(expected, iter.nextInt()); - assertFalse(iter.hasNext()); - } + private HashFunction hf = new MD5Cyclic(); + private final Shape shape = new Shape(hf, 1, 345, 1); + private String testString = HasherBuilderTest.getExtendedString(); /** * Tests that hashing a byte array works as expected. */ @Test public void buildTest_byteArray() { - final DynamicHasher hasher = builder.with("Hello".getBytes()).build(); - final int expected = 1519797563; + final byte[] bytes = testString.getBytes(); + final DynamicHasher hasher = builder.with(bytes).build(); + final int expected = (int) Math.floorMod(hf.apply(bytes, 0), shape.getNumberOfBits()); final OfInt iter = hasher.iterator(shape); @@ -82,8 +70,9 @@ public class DynamicHasherBuilderTest { */ @Test public void buildTest_String() { - final DynamicHasher hasher = builder.with("Hello").build(); - final int expected = 1519797563; + final byte[] bytes = testString.getBytes(StandardCharsets.UTF_8); + final DynamicHasher hasher = builder.with(testString, StandardCharsets.UTF_8).build(); + final int expected = (int) Math.floorMod(hf.apply(bytes, 0), shape.getNumberOfBits()); final OfInt iter = hasher.iterator(shape); @@ -92,11 +81,44 @@ public class DynamicHasherBuilderTest { assertFalse(iter.hasNext()); } + /** + * Tests that hashing an unencoded string (UTF-16 little-endian bytes) works as expected. 
+ */ + @Test + public void buildTest_UnencodedString() { + final byte[] bytes = testString.getBytes(StandardCharsets.UTF_16LE); + final DynamicHasher hasher = builder.withUnencoded(testString).build(); + final int expected = (int) Math.floorMod(hf.apply(bytes, 0), shape.getNumberOfBits()); + + final OfInt iter = hasher.iterator(shape); + + assertTrue(iter.hasNext()); + assertEquals(expected, iter.nextInt()); + assertFalse(iter.hasNext()); + } + + /** + * Tests that build resets the builder. + */ + @Test + public void buildResetTest() { + builder.with(new byte[] {123}); + final OfInt iter = builder.build().iterator(shape); + + assertTrue(iter.hasNext()); + iter.next(); + assertFalse(iter.hasNext()); + + // Nothing added since last build so it should be an empty hasher + final OfInt iter2 = builder.build().iterator(shape); + assertFalse(iter2.hasNext()); + } + /** * Sets up the builder for testing. */ @Before public void setup() { - builder = new DynamicHasher.Builder(new MD5Cyclic()); + builder = new DynamicHasher.Builder(hf); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java index e79f6b7fe..317bf95d2 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/DynamicHasherTest.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import java.nio.charset.StandardCharsets; import java.util.NoSuchElementException; import java.util.PrimitiveIterator.OfInt; @@ -80,7 +81,7 @@ public class DynamicHasherTest { final int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62}; - final Hasher hasher = builder.with("Hello").build(); + final Hasher hasher = builder.with("Hello", 
StandardCharsets.UTF_8).build(); final OfInt iter = hasher.iterator(shape); @@ -99,7 +100,7 @@ public class DynamicHasherTest { final int[] expected = {6, 69, 44, 19, 10, 57, 48, 23, 70, 61, 36, 11, 2, 49, 24, 15, 62, 1, 63, 53, 43, 17, 7, 69, 59, 49, 39, 13, 3, 65, 55, 45, 35, 25}; - final Hasher hasher = builder.with("Hello").with("World").build(); + final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).with("World", StandardCharsets.UTF_8).build(); final OfInt iter = hasher.iterator(shape); @@ -122,7 +123,7 @@ public class DynamicHasherTest { @Test public void testGetBits_WrongShape() { - final Hasher hasher = builder.with("Hello").build(); + final Hasher hasher = builder.with("Hello", StandardCharsets.UTF_8).build(); try { hasher.iterator(new Shape(testFunction, 3, 72, 17)); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java new file mode 100644 index 000000000..767b54a6c --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/hasher/HasherBuilderTest.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import org.apache.commons.collections4.bloomfilter.hasher.Hasher.Builder; +import org.apache.commons.lang3.NotImplementedException; +import org.junit.Assert; +import org.junit.Test; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; + +/** + * Tests the + * {@link org.apache.commons.collections4.bloomfilter.hasher.Hasher.Builder Hasher.Builder}. + */ +public class HasherBuilderTest { + + /** + * Simple class to collect byte[] items added to the builder. + */ + private static class TestBuilder implements Hasher.Builder { + ArrayList items = new ArrayList<>(); + + @Override + public Hasher build() { + throw new NotImplementedException("Not required"); + } + + @Override + public Builder with(byte[] item) { + items.add(item); + return this; + } + } + + /** + * Tests that adding CharSequence items works correctly. + */ + @Test + public void withCharSequenceTest() { + final String ascii = "plain"; + final String extended = getExtendedString(); + for (final String s : new String[] {ascii, extended}) { + for (final Charset cs : new Charset[] { + StandardCharsets.ISO_8859_1, StandardCharsets.UTF_8, StandardCharsets.UTF_16 + }) { + TestBuilder builder = new TestBuilder(); + builder.with(s, cs); + Assert.assertArrayEquals(s.getBytes(cs), builder.items.get(0)); + } + } + } + + /** + * Tests that adding unencoded CharSequence items works correctly. 
+ */ + @Test + public void withUnecodedCharSequenceTest() { + final String ascii = "plain"; + final String extended = getExtendedString(); + for (final String s : new String[] {ascii, extended}) { + final TestBuilder builder = new TestBuilder(); + builder.withUnencoded(s); + final byte[] encoded = builder.items.get(0); + final char[] original = s.toCharArray(); + // Should be twice the length + Assert.assertEquals(original.length * 2, encoded.length); + // Should be little endian (lower bits first) + final CharBuffer buffer = ByteBuffer.wrap(encoded) + .order(ByteOrder.LITTLE_ENDIAN).asCharBuffer(); + for (int i = 0; i < original.length; i++) { + Assert.assertEquals(original[i], buffer.get(i)); + } + } + } + + /** + * Gets a string with non-standard characters. + * + * @return the extended string + */ + static String getExtendedString() { + final char[] data = {'e', 'x', 't', 'e', 'n', 'd', 'e', 'd', ' ', + // Add some characters that are non standard + // non-ascii + 0xCA98, + // UTF-16 surrogate pair + 0xD803, 0xDE6D + // Add other cases here ... + }; + return String.valueOf(data); + } +}