parent
4ca15c909e
commit
0144471d0f
|
@ -345,6 +345,12 @@
|
||||||
<artifactId>opennlp-tools</artifactId>
|
<artifactId>opennlp-tools</artifactId>
|
||||||
<version>1.8.0</version>
|
<version>1.8.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>info.debatty</groupId>
|
||||||
|
<artifactId>java-lsh</artifactId>
|
||||||
|
<version>${java-lsh.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
<properties>
|
<properties>
|
||||||
|
@ -371,6 +377,7 @@
|
||||||
<serenity.plugin.version>1.4.0</serenity.plugin.version>
|
<serenity.plugin.version>1.4.0</serenity.plugin.version>
|
||||||
<jUnitParams.version>1.1.0</jUnitParams.version>
|
<jUnitParams.version>1.1.0</jUnitParams.version>
|
||||||
<netty.version>4.1.10.Final</netty.version>
|
<netty.version>4.1.10.Final</netty.version>
|
||||||
|
<java-lsh.version>0.10</java-lsh.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
</project>
|
</project>
|
|
@ -0,0 +1,48 @@
|
||||||
|
package com.baeldung.lsh;
|
||||||
|
|
||||||
|
import info.debatty.java.lsh.LSHMinHash;
|
||||||
|
import org.junit.Ignore;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
|
||||||
|
public class LocalSensitiveHashingUnitTest {
|
||||||
|
|
||||||
|
@Ignore("for simplicity of the example number of input vectors is very low, that's why LSH may yield non deterministic results")
|
||||||
|
@Test()
|
||||||
|
public void givenNVectors_whenPerformLSH_thenShouldCalculateSameHashForSimilarVectors() {
|
||||||
|
//given
|
||||||
|
boolean[] vector1 = new boolean[]{true, true, true, true, true};
|
||||||
|
boolean[] vector2 = new boolean[]{false, false, false, true, false};
|
||||||
|
boolean[] vector3 = new boolean[]{false, false, true, true, false};
|
||||||
|
|
||||||
|
int sizeOfVectors = 5;
|
||||||
|
int numberOfBuckets = 10;
|
||||||
|
int stages = 4;
|
||||||
|
|
||||||
|
LSHMinHash lsh = new LSHMinHash(stages, numberOfBuckets, sizeOfVectors);
|
||||||
|
|
||||||
|
//when
|
||||||
|
int[] firstHash = lsh.hash(vector1);
|
||||||
|
int[] secondHash = lsh.hash(vector2);
|
||||||
|
int[] thirdHash = lsh.hash(vector3);
|
||||||
|
|
||||||
|
System.out.println(Arrays.toString(firstHash));
|
||||||
|
System.out.println(Arrays.toString(secondHash));
|
||||||
|
System.out.println(Arrays.toString(thirdHash));
|
||||||
|
|
||||||
|
//then
|
||||||
|
int lastIndexOfResult = stages - 1;
|
||||||
|
assertThat(firstHash[lastIndexOfResult]).isNotEqualTo(secondHash[lastIndexOfResult]);
|
||||||
|
assertThat(firstHash[lastIndexOfResult]).isNotEqualTo(thirdHash[lastIndexOfResult]);
|
||||||
|
assertThat(isCloseOrEqual(secondHash[lastIndexOfResult], thirdHash[lastIndexOfResult], numberOfBuckets)).isTrue();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isCloseOrEqual(int secondHash, int thirdHash, int numberOfBuckets) {
|
||||||
|
return Math.abs(secondHash - thirdHash) < numberOfBuckets / 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue