5080 new similarity algorithm to handle numeric normalization (#5082)
* solution * test * docs + changelog * fix formatting --------- Co-authored-by: justindar <justin.dar@smilecdr.com>
This commit is contained in:
parent
2fbd3fa272
commit
e02468b552
|
@ -0,0 +1,7 @@
|
||||||
|
---
|
||||||
|
type: add
|
||||||
|
issue: 5080
|
||||||
|
title: "Extended the existing MDM similarity algorithms to numeric values such that the input is normalized
|
||||||
|
by removing all non-numeric characters from the string before the similarity algorithm is applied. This can
|
||||||
|
be useful when measuring similarity between identifying numbers or phone numbers where dashes or
|
||||||
|
other special separating characters may be used."
|
|
@ -616,6 +616,46 @@ The following algorithms are currently supported:
|
||||||
</td>
|
</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>NUMERIC_JARO_WINKLER</td>
|
||||||
|
<td>similarity</td>
|
||||||
|
<td>
|
||||||
|
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#jaro-winkler">tdebatty Jaro Winkler</a>
|
||||||
|
</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>NUMERIC_COSINE</td>
|
||||||
|
<td>similarity</td>
|
||||||
|
<td>
|
||||||
|
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#cosine-similarity">tdebatty Cosine Similarity</a>
|
||||||
|
</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>NUMERIC_JACCARD</td>
|
||||||
|
<td>similarity</td>
|
||||||
|
<td>
|
||||||
|
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#jaccard-index">tdebatty Jaccard Index</a>
|
||||||
|
</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>NUMERIC_LEVENSCHTEIN</td>
|
||||||
|
<td>similarity</td>
|
||||||
|
<td>
|
||||||
|
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#normalized-levenshtein">tdebatty Normalized Levenshtein</a>
|
||||||
|
</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>NUMERIC_SORENSEN_DICE</td>
|
||||||
|
<td>similarity</td>
|
||||||
|
<td>
|
||||||
|
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#sorensen-dice-coefficient">tdebatty Sorensen-Dice coefficient</a>
|
||||||
|
</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
package ca.uhn.fhir.mdm.rules.similarity;
|
||||||
|
|
||||||
|
import ca.uhn.fhir.context.FhirContext;
|
||||||
|
import ca.uhn.fhir.context.phonetic.NumericEncoder;
|
||||||
|
import ca.uhn.fhir.mdm.rules.matcher.util.StringMatcherUtils;
|
||||||
|
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
|
||||||
|
import org.hl7.fhir.instance.model.api.IBase;
|
||||||
|
import org.hl7.fhir.instance.model.api.IPrimitiveType;
|
||||||
|
|
||||||
|
public class HapiNumericSimilarity extends HapiStringSimilarity {
|
||||||
|
private final NumericEncoder encoder = new NumericEncoder();
|
||||||
|
|
||||||
|
public HapiNumericSimilarity(NormalizedStringSimilarity theStringSimilarity) {
|
||||||
|
super(theStringSimilarity);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double similarity(FhirContext theFhirContext, IBase theLeftBase, IBase theRightBase, boolean theExact) {
|
||||||
|
|
||||||
|
if (theLeftBase instanceof IPrimitiveType && theRightBase instanceof IPrimitiveType) {
|
||||||
|
String leftString =
|
||||||
|
encoder.encode(StringMatcherUtils.extractString((IPrimitiveType<?>) theLeftBase, theExact));
|
||||||
|
String rightString =
|
||||||
|
encoder.encode(StringMatcherUtils.extractString((IPrimitiveType<?>) theRightBase, theExact));
|
||||||
|
|
||||||
|
return myStringSimilarity.similarity(leftString, rightString);
|
||||||
|
}
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -29,7 +29,7 @@ import org.hl7.fhir.instance.model.api.IPrimitiveType;
|
||||||
* Similarity measure for two IBase fields whose similarity can be measured by their String representations.
|
* Similarity measure for two IBase fields whose similarity can be measured by their String representations.
|
||||||
*/
|
*/
|
||||||
public class HapiStringSimilarity implements IMdmFieldSimilarity {
|
public class HapiStringSimilarity implements IMdmFieldSimilarity {
|
||||||
private final NormalizedStringSimilarity myStringSimilarity;
|
protected final NormalizedStringSimilarity myStringSimilarity;
|
||||||
|
|
||||||
public HapiStringSimilarity(NormalizedStringSimilarity theStringSimilarity) {
|
public HapiStringSimilarity(NormalizedStringSimilarity theStringSimilarity) {
|
||||||
myStringSimilarity = theStringSimilarity;
|
myStringSimilarity = theStringSimilarity;
|
||||||
|
|
|
@ -35,7 +35,12 @@ public enum MdmSimilarityEnum {
|
||||||
COSINE(new HapiStringSimilarity(new Cosine())),
|
COSINE(new HapiStringSimilarity(new Cosine())),
|
||||||
JACCARD(new HapiStringSimilarity(new Jaccard())),
|
JACCARD(new HapiStringSimilarity(new Jaccard())),
|
||||||
LEVENSCHTEIN(new HapiStringSimilarity(new NormalizedLevenshtein())),
|
LEVENSCHTEIN(new HapiStringSimilarity(new NormalizedLevenshtein())),
|
||||||
SORENSEN_DICE(new HapiStringSimilarity(new SorensenDice()));
|
SORENSEN_DICE(new HapiStringSimilarity(new SorensenDice())),
|
||||||
|
NUMERIC_JARO_WINKLER(new HapiNumericSimilarity(new JaroWinkler())),
|
||||||
|
NUMERIC_COSINE(new HapiNumericSimilarity(new Cosine())),
|
||||||
|
NUMERIC_JACCARD(new HapiNumericSimilarity(new Jaccard())),
|
||||||
|
NUMERIC_LEVENSCHTEIN(new HapiNumericSimilarity(new NormalizedLevenshtein())),
|
||||||
|
NUMERIC_SORENSEN_DICE(new HapiNumericSimilarity(new SorensenDice()));
|
||||||
|
|
||||||
private final IMdmFieldSimilarity myMdmFieldSimilarity;
|
private final IMdmFieldSimilarity myMdmFieldSimilarity;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
package ca.uhn.fhir.mdm.rules.matcher;
|
||||||
|
|
||||||
|
import ca.uhn.fhir.mdm.rules.similarity.HapiNumericSimilarity;
|
||||||
|
import ca.uhn.fhir.mdm.rules.similarity.HapiStringSimilarity;
|
||||||
|
import ca.uhn.fhir.mdm.rules.similarity.IMdmFieldSimilarity;
|
||||||
|
import info.debatty.java.stringsimilarity.Cosine;
|
||||||
|
import info.debatty.java.stringsimilarity.Jaccard;
|
||||||
|
import info.debatty.java.stringsimilarity.JaroWinkler;
|
||||||
|
import info.debatty.java.stringsimilarity.NormalizedLevenshtein;
|
||||||
|
import info.debatty.java.stringsimilarity.SorensenDice;
|
||||||
|
import org.hl7.fhir.r4.model.StringType;
|
||||||
|
import org.junit.jupiter.params.ParameterizedTest;
|
||||||
|
import org.junit.jupiter.params.provider.CsvSource;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
public class NumericSimilarityR4Test extends StringSimilarityR4Test {
|
||||||
|
private static final HapiStringSimilarity NUMERIC_JARO_WINKLER = new HapiNumericSimilarity(new JaroWinkler());
|
||||||
|
private static final HapiStringSimilarity NUMERIC_COSINE = new HapiNumericSimilarity(new Cosine());
|
||||||
|
private static final HapiStringSimilarity NUMERIC_JACCARD = new HapiNumericSimilarity(new Jaccard());
|
||||||
|
private static final HapiStringSimilarity NUMERIC_LEVENSCHTEIN = new HapiNumericSimilarity(new NormalizedLevenshtein());
|
||||||
|
private static final HapiStringSimilarity NUMERIC_SORENSEN_DICE = new HapiNumericSimilarity(new SorensenDice());
|
||||||
|
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@CsvSource({
|
||||||
|
"123-45-6789, 123456789",
|
||||||
|
"1234-5-6789, 123456789",
|
||||||
|
"abc123, 123",
|
||||||
|
"(416) 967-1111, 4169671111,"
|
||||||
|
})
|
||||||
|
public void testNumericSimilarity_withExactMatches(String theLeft, String theRight) {
|
||||||
|
assertEquals(1.0, similarity(NUMERIC_JARO_WINKLER, theLeft, theRight));
|
||||||
|
assertEquals(1.0, similarity(NUMERIC_COSINE, theLeft, theRight));
|
||||||
|
assertEquals(1.0, similarity(NUMERIC_JACCARD, theLeft, theRight));
|
||||||
|
assertEquals(1.0, similarity(NUMERIC_LEVENSCHTEIN, theLeft, theRight));
|
||||||
|
assertEquals(1.0, similarity(NUMERIC_SORENSEN_DICE, theLeft, theRight));
|
||||||
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest
|
||||||
|
@CsvSource({
|
||||||
|
"123546789, 123-54-6789, 123456789",
|
||||||
|
"123456789, 1234-5-6789, 123456789",
|
||||||
|
"321, abc321, 123",
|
||||||
|
"1231231234, (123) 123-1234, 1231234321,"
|
||||||
|
})
|
||||||
|
public void testNumericSimilarity_givesSameResultAsStringSimilarity(String theLeft, String theLeftWithNonNumerics, String theRight) {
|
||||||
|
assertEquals(similarity(JARO_WINKLER, theLeft, theRight), similarity(NUMERIC_JARO_WINKLER, theLeftWithNonNumerics, theRight));
|
||||||
|
assertEquals(similarity(COSINE, theLeft, theRight), similarity(NUMERIC_COSINE, theLeftWithNonNumerics, theRight));
|
||||||
|
assertEquals(similarity(JACCARD, theLeft, theRight), similarity(NUMERIC_JACCARD, theLeftWithNonNumerics, theRight));
|
||||||
|
assertEquals(similarity(LEVENSCHTEIN, theLeft, theRight), similarity(NUMERIC_LEVENSCHTEIN, theLeftWithNonNumerics, theRight));
|
||||||
|
assertEquals(similarity(SORENSEN_DICE, theLeft, theRight), similarity(NUMERIC_SORENSEN_DICE, theLeftWithNonNumerics, theRight));
|
||||||
|
}
|
||||||
|
|
||||||
|
private double similarity(IMdmFieldSimilarity theSimilarity, String theLeft, String theRight) {
|
||||||
|
return theSimilarity.similarity(ourFhirContext, new StringType(theLeft), new StringType(theRight), false);
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,11 +17,11 @@ public class StringSimilarityR4Test extends BaseMatcherR4Test {
|
||||||
public static final String LEFT = "somon";
|
public static final String LEFT = "somon";
|
||||||
public static final String RIGHT = "slomon";
|
public static final String RIGHT = "slomon";
|
||||||
|
|
||||||
private static final HapiStringSimilarity JARO_WINKLER = new HapiStringSimilarity(new JaroWinkler());
|
protected static final HapiStringSimilarity JARO_WINKLER = new HapiStringSimilarity(new JaroWinkler());
|
||||||
private static final HapiStringSimilarity COSINE = new HapiStringSimilarity(new Cosine());
|
protected static final HapiStringSimilarity COSINE = new HapiStringSimilarity(new Cosine());
|
||||||
private static final HapiStringSimilarity JACCARD = new HapiStringSimilarity(new Jaccard());
|
protected static final HapiStringSimilarity JACCARD = new HapiStringSimilarity(new Jaccard());
|
||||||
private static final HapiStringSimilarity LEVENSCHTEIN = new HapiStringSimilarity(new NormalizedLevenshtein());
|
protected static final HapiStringSimilarity LEVENSCHTEIN = new HapiStringSimilarity(new NormalizedLevenshtein());
|
||||||
private static final HapiStringSimilarity SORENSEN_DICE = new HapiStringSimilarity(new SorensenDice());
|
protected static final HapiStringSimilarity SORENSEN_DICE = new HapiStringSimilarity(new SorensenDice());
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSlomon() {
|
public void testSlomon() {
|
||||||
|
|
Loading…
Reference in New Issue