/*
 * Provenance: recovered from git patch 0144471d0fbb5597018c0faf008e1fa205591d54
 *   Author:  Tomasz Lelek
 *   Date:    Sun, 18 Jun 2017 03:35:33 +0200
 *   Subject: BAEL-980 lsh code (#2047) -- "BAEL-980 lsh code", "BAEL-980 rename test"
 *   Path:    libraries/src/test/java/com/baeldung/lsh/LocalSensitiveHashingUnitTest.java
 *
 * The same patch also touches libraries/pom.xml: it adds the dependency
 * info.debatty:java-lsh with version ${java-lsh.version}, and a new property value 0.10
 * (presumably java-lsh.version -- the XML tags were lost in the copy this was recovered from).
 */
package com.baeldung.lsh;

import info.debatty.java.lsh.LSHMinHash;
import org.junit.Ignore;
import org.junit.Test;

import java.util.Arrays;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * Example of hashing boolean vectors with {@link LSHMinHash}.
 *
 * <p>Three five-element vectors are hashed with one shared {@code LSHMinHash} instance.
 * The test then compares the hash values found at the last index ({@code stages - 1})
 * of each result: the first vector must differ from the other two, while the second and
 * third vectors must be equal or close to each other.
 */
public class LocalSensitiveHashingUnitTest {

    /**
     * Hashes three vectors and checks the relation between their last hash values.
     *
     * <p>The test is disabled: as the {@code @Ignore} reason states, the number of input
     * vectors is very low, so the outcome may be non-deterministic.
     */
    @Ignore("for simplicity of the example number of input vectors is very low, that's why LSH may yield non deterministic results")
    @Test
    public void givenNVectors_whenPerformLSH_thenShouldCalculateSameHashForSimilarVectors() {
        // given: the hashing configuration ...
        final int sizeOfVectors = 5;
        final int numberOfBuckets = 10;
        final int stages = 4;
        final LSHMinHash minHash = new LSHMinHash(stages, numberOfBuckets, sizeOfVectors);

        // ... and the input vectors: one with every position set, and two that
        // differ from each other in a single position only
        final boolean[] allSet = {true, true, true, true, true};
        final boolean[] onePositionSet = {false, false, false, true, false};
        final boolean[] twoPositionsSet = {false, false, true, true, false};

        // when
        final int[] hashOfAllSet = minHash.hash(allSet);
        final int[] hashOfOneSet = minHash.hash(onePositionSet);
        final int[] hashOfTwoSet = minHash.hash(twoPositionsSet);

        // print the full hash arrays so the individual values can be inspected
        System.out.println(Arrays.toString(hashOfAllSet));
        System.out.println(Arrays.toString(hashOfOneSet));
        System.out.println(Arrays.toString(hashOfTwoSet));

        // then: only the value at the last index of each result is compared
        final int last = stages - 1;
        final int bucketOfAllSet = hashOfAllSet[last];
        final int bucketOfOneSet = hashOfOneSet[last];
        final int bucketOfTwoSet = hashOfTwoSet[last];

        assertThat(bucketOfAllSet).isNotEqualTo(bucketOfOneSet);
        assertThat(bucketOfAllSet).isNotEqualTo(bucketOfTwoSet);
        assertThat(isCloseOrEqual(bucketOfOneSet, bucketOfTwoSet, numberOfBuckets)).isTrue();
    }

    /**
     * Tells whether two hash values are equal or near each other.
     *
     * @param secondHash      first value to compare
     * @param thirdHash       second value to compare
     * @param numberOfBuckets bucket count; half of it (integer division) is the exclusive
     *                        upper bound for the allowed absolute difference
     * @return {@code true} when the absolute difference is below {@code numberOfBuckets / 2}
     */
    private boolean isCloseOrEqual(int secondHash, int thirdHash, int numberOfBuckets) {
        final int distance = Math.abs(secondHash - thirdHash);
        final int threshold = numberOfBuckets / 2;
        return distance < threshold;
    }
}